perf: pre-filter autocomplete candidates with regex prefix query

2026-04-03 06:38:51 +00:00 · 2026-04-02 20:04:10 -07:00
parent 4f84282ef3
commit 733559413e
2 changed files with 19 additions and 4 deletions
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -605,11 +605,26 @@ class TantivyBackend:
        # Apply permission filter for non-superusers so autocomplete words
        # from invisible documents don't leak to other users.
        if user is not None and not user.is_superuser:
-            base_query = build_permission_filter(self._schema, user)
+            permission_query = build_permission_filter(self._schema, user)
        else:
-            base_query = tantivy.Query.all_query()
+            permission_query = tantivy.Query.all_query()

-        results = searcher.search(base_query, limit=10000)
+        # Narrow to documents that actually contain a word starting with the
+        # prefix before loading stored fields, avoiding a full collection scan.
+        prefix_filter = tantivy.Query.regex_query(
+            self._schema,
+            "autocomplete_word",
+            regex.escape(normalized_term) + ".*",
+        )
+        results = searcher.search(
+            tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, permission_query),
+                    (tantivy.Occur.Must, prefix_filter),
+                ],
+            ),
+            limit=10000,
+        )

        # Count how many visible documents each word appears in.
        # Using Counter (not set) preserves per-word document frequency so
--- a/src/documents/search/_schema.py
+++ b/src/documents/search/_schema.py
@@ -53,7 +53,7 @@ def build_schema() -> tantivy.Schema:
    # CJK support - not stored, indexed only
    sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")

-    # Autocomplete prefix scan - stored, not indexed
+    # Autocomplete prefix scan - stored; indexed (default) for regex prefix queries
    sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

    sb.add_text_field("tag", stored=True, tokenizer_name="paperless_text")