From 733559413ec1f00a8ee76a91c72839e2ee5b97d7 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 2 Apr 2026 20:04:10 -0700 Subject: [PATCH] perf: pre-filter autocomplete candidates with regex prefix query --- src/documents/search/_backend.py | 21 ++++++++++++++++++--- src/documents/search/_schema.py | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index a15a9370b..6a39f2b7b 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -605,11 +605,26 @@ class TantivyBackend: # Apply permission filter for non-superusers so autocomplete words # from invisible documents don't leak to other users. if user is not None and not user.is_superuser: - base_query = build_permission_filter(self._schema, user) + permission_query = build_permission_filter(self._schema, user) else: - base_query = tantivy.Query.all_query() + permission_query = tantivy.Query.all_query() - results = searcher.search(base_query, limit=10000) + # Narrow to documents that actually contain a word starting with the + # prefix before loading stored fields, avoiding a full collection scan. + prefix_filter = tantivy.Query.regex_query( + self._schema, + "autocomplete_word", + regex.escape(normalized_term) + ".*", + ) + results = searcher.search( + tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, permission_query), + (tantivy.Occur.Must, prefix_filter), + ], + ), + limit=10000, + ) # Count how many visible documents each word appears in. # Using Counter (not set) preserves per-word document frequency so diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index ba6646007..4b0cd110d 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -53,7 +53,7 @@ def build_schema() -> tantivy.Schema: # CJK support - not stored, indexed only sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer") - # Autocomplete prefix scan - stored, not indexed + # Autocomplete prefix scan - stored, indexed by default sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw") sb.add_text_field("tag", stored=True, tokenizer_name="paperless_text")