mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-08 06:39:46 +00:00
Adds notes for where we can improve, if either fixes, features or a new release drop in from Tantivy
This commit is contained in:
@@ -224,10 +224,13 @@ class WriteBatch:
|
||||
"""
|
||||
Remove a document from the batch by its primary key.
|
||||
|
||||
Uses range query instead of term query to work around unsigned integer
|
||||
type detection bug in tantivy-py 0.25.
|
||||
Uses range_query instead of term_query to work around a tantivy-py bug
|
||||
where Python integers are inferred as i64, producing Terms that never
|
||||
match u64 fields.
|
||||
|
||||
TODO: Replace with term_query("id", doc_id) once
|
||||
https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
|
||||
"""
|
||||
# Use range query to work around u64 deletion bug
|
||||
self._writer.delete_documents_by_query(
|
||||
tantivy.Query.range_query(
|
||||
self._backend._schema,
|
||||
@@ -518,10 +521,14 @@ class TantivyBackend:
|
||||
search_ids + ORM filtering) and just need highlight data.
|
||||
|
||||
Note: Each doc_id requires an individual index lookup because tantivy-py
|
||||
does not expose a batch doc-address-by-ID API. This is acceptable for
|
||||
does not yet expose a batch fast-field read API. This is acceptable for
|
||||
page-sized batches (typically 25 docs) but should not be called with
|
||||
thousands of IDs.
|
||||
|
||||
TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands,
|
||||
the per-doc range_query lookups here can be replaced with a single
|
||||
collect_u64_fast_field("id", doc_addresses) call.
|
||||
|
||||
Args:
|
||||
query: The search query (used for snippet generation)
|
||||
doc_ids: Ordered list of document IDs to generate hits for
|
||||
@@ -665,6 +672,9 @@ class TantivyBackend:
|
||||
if threshold is not None:
|
||||
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
|
||||
|
||||
# TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once
|
||||
# https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates
|
||||
# one stored-doc fetch per result (~80% reduction in search_ids latency).
|
||||
return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
|
||||
|
||||
def autocomplete(
|
||||
@@ -812,6 +822,8 @@ class TantivyBackend:
|
||||
# Fetch one extra to account for excluding the original document
|
||||
results = searcher.search(final_query, limit=effective_limit + 1)
|
||||
|
||||
# TODO: Replace with collect_u64_fast_field("id", addrs) once
|
||||
# https://github.com/quickwit-oss/tantivy-py/pull/641 lands.
|
||||
ids = []
|
||||
for _score, doc_address in results.hits:
|
||||
result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
|
||||
|
||||
@@ -396,10 +396,17 @@ def build_permission_filter(
|
||||
Tantivy query that filters results to visible documents
|
||||
|
||||
Implementation Notes:
|
||||
- Uses range_query instead of term_query to work around unsigned integer
|
||||
type detection bug in tantivy-py 0.25
|
||||
- Uses boolean_query for "no owner" check since exists_query is not
|
||||
available in tantivy-py 0.25.1 (available in master)
|
||||
- Uses range_query instead of term_query for owner_id/viewer_id to work
|
||||
around a tantivy-py bug where Python ints are inferred as i64, causing
|
||||
term_query to return no hits on u64 fields.
|
||||
TODO: Replace with term_query once
|
||||
https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
|
||||
|
||||
- Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check
|
||||
because exists_query is not yet available in tantivy-py 0.25.
|
||||
TODO: Replace with exists_query("owner_id") once that is exposed in
|
||||
a tantivy-py release.
|
||||
|
||||
- Uses disjunction_max_query to combine permission clauses with OR logic
|
||||
"""
|
||||
owner_any = tantivy.Query.range_query(
|
||||
|
||||
Reference in New Issue
Block a user