mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-23 06:55:23 +00:00
perf: use prefix query in autocomplete to avoid full-index scan
Previously, autocomplete scanned every visible document to extract words and then filtered them by prefix in Python. It now builds a query on autocomplete_word (an exact term match ORed with an anchored regex prefix match), so Tantivy returns only the documents that contain matching words. At 5k docs: rare prefixes go from 335ms to <1ms and common prefixes from 342ms to 199ms, with 58-99% less peak memory. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import threading
|
||||
from collections import Counter
|
||||
from datetime import UTC
|
||||
@@ -682,30 +683,60 @@ class TantivyBackend:
|
||||
|
||||
searcher = self._index.searcher()
|
||||
|
||||
# Apply permission filter for non-superusers so autocomplete words
|
||||
# from invisible documents don't leak to other users.
|
||||
# Build a prefix query on autocomplete_word so we only scan docs
|
||||
# containing words that start with the prefix, not the entire index.
|
||||
# tantivy regex is implicitly anchored; .+ avoids the empty-match
|
||||
# error that .* triggers. We OR with term_query to also match the
|
||||
# exact prefix as a complete word.
|
||||
escaped = re.escape(normalized_term)
|
||||
prefix_query = tantivy.Query.boolean_query(
|
||||
[
|
||||
(
|
||||
tantivy.Occur.Should,
|
||||
tantivy.Query.term_query(
|
||||
self._schema,
|
||||
"autocomplete_word",
|
||||
normalized_term,
|
||||
),
|
||||
),
|
||||
(
|
||||
tantivy.Occur.Should,
|
||||
tantivy.Query.regex_query(
|
||||
self._schema,
|
||||
"autocomplete_word",
|
||||
f"{escaped}.+",
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
# Intersect with permission filter so autocomplete words from
|
||||
# invisible documents don't leak to other users.
|
||||
if user is not None and not user.is_superuser:
|
||||
base_query = build_permission_filter(self._schema, user)
|
||||
final_query = tantivy.Query.boolean_query(
|
||||
[
|
||||
(tantivy.Occur.Must, prefix_query),
|
||||
(tantivy.Occur.Must, build_permission_filter(self._schema, user)),
|
||||
],
|
||||
)
|
||||
else:
|
||||
base_query = tantivy.Query.all_query()
|
||||
final_query = prefix_query
|
||||
|
||||
results = searcher.search(base_query, limit=searcher.num_docs)
|
||||
results = searcher.search(final_query, limit=searcher.num_docs)
|
||||
|
||||
# Count how many visible documents each word appears in.
|
||||
# Using Counter (not set) preserves per-word document frequency so
|
||||
# we can rank suggestions by how commonly they occur — the same
|
||||
# signal Whoosh used for Tf/Idf-based autocomplete ordering.
|
||||
# Count how many visible documents each matching word appears in.
|
||||
word_counts: Counter[str] = Counter()
|
||||
for _score, doc_address in results.hits:
|
||||
stored_doc = searcher.doc(doc_address)
|
||||
doc_dict = stored_doc.to_dict()
|
||||
if "autocomplete_word" in doc_dict:
|
||||
word_counts.update(doc_dict["autocomplete_word"])
|
||||
for word in doc_dict["autocomplete_word"]:
|
||||
if word.startswith(normalized_term):
|
||||
word_counts[word] += 1
|
||||
|
||||
# Filter to prefix matches, sort by document frequency descending;
|
||||
# break ties alphabetically for stable, deterministic output.
|
||||
# Sort by document frequency descending; break ties alphabetically.
|
||||
matches = sorted(
|
||||
(w for w in word_counts if w.startswith(normalized_term)),
|
||||
word_counts,
|
||||
key=lambda w: (-word_counts[w], w),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user