perf: pre-filter autocomplete candidates with regex prefix query

This commit is contained in:
Trenton Holmes
2026-04-02 20:04:10 -07:00
parent 4f84282ef3
commit 733559413e
2 changed files with 19 additions and 4 deletions

View File

@@ -605,11 +605,26 @@ class TantivyBackend:
# Apply permission filter for non-superusers so autocomplete words
# from invisible documents don't leak to other users.
if user is not None and not user.is_superuser:
base_query = build_permission_filter(self._schema, user)
permission_query = build_permission_filter(self._schema, user)
else:
base_query = tantivy.Query.all_query()
permission_query = tantivy.Query.all_query()
results = searcher.search(base_query, limit=10000)
# Narrow to documents that actually contain a word starting with the
# prefix before loading stored fields, avoiding a full collection scan.
prefix_filter = tantivy.Query.regex_query(
self._schema,
"autocomplete_word",
regex.escape(normalized_term) + ".*",
)
results = searcher.search(
tantivy.Query.boolean_query(
[
(tantivy.Occur.Must, permission_query),
(tantivy.Occur.Must, prefix_filter),
],
),
limit=10000,
)
# Count how many visible documents each word appears in.
# Using Counter (not set) preserves per-word document frequency so

View File

@@ -53,7 +53,7 @@ def build_schema() -> tantivy.Schema:
# CJK support - not stored, indexed only
sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")
# Autocomplete prefix scan - stored, not indexed
# Autocomplete prefix scan - stored, indexed by default
sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")
sb.add_text_field("tag", stored=True, tokenizer_name="paperless_text")