mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-30 21:02:45 +00:00
Enhancement: rank autocomplete suggestions by document frequency
Replace set-based alphabetical autocomplete with Counter-based document-frequency ordering. Words appearing in more of the user's visible documents rank first — the same signal Whoosh used for Tf/Idf-based ordering, computed permission-correctly from already-fetched stored values at no extra index cost. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -167,8 +167,8 @@ Query parameters:
|
||||
- `term`: The incomplete term.
|
||||
- `limit`: Amount of results. Defaults to 10.
|
||||
|
||||
Results are ordered alphabetically by prefix match. The first result is
|
||||
the lexicographically first word in the index that starts with the given term.
|
||||
Results are ordered by how many of the user's visible documents contain
|
||||
each matching word. The first result is the word that appears in the most documents.
|
||||
|
||||
```json
|
||||
["term1", "term3", "term6", "term4"]
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import bisect
|
||||
import logging
|
||||
import threading
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
@@ -524,8 +524,11 @@ class TantivyBackend:
|
||||
|
||||
results = searcher.search(base_query, limit=10000)
|
||||
|
||||
# Collect all autocomplete words
|
||||
words = set()
|
||||
# Count how many visible documents each word appears in.
|
||||
# Using Counter (not set) preserves per-word document frequency so
|
||||
# we can rank suggestions by how commonly they occur — the same
|
||||
# signal Whoosh used for Tf/Idf-based autocomplete ordering.
|
||||
word_counts: Counter[str] = Counter()
|
||||
for hit in results.hits:
|
||||
# hits are (score, doc_address) tuples
|
||||
doc_address = hit[1] if len(hit) == 2 else hit[0]
|
||||
@@ -533,27 +536,16 @@ class TantivyBackend:
|
||||
stored_doc = searcher.doc(doc_address)
|
||||
doc_dict = stored_doc.to_dict()
|
||||
if "autocomplete_word" in doc_dict:
|
||||
for word in doc_dict["autocomplete_word"]:
|
||||
words.add(word)
|
||||
word_counts.update(doc_dict["autocomplete_word"])
|
||||
|
||||
# Sort and find matches
|
||||
sorted_words = sorted(words)
|
||||
# Filter to prefix matches, then sort by document frequency descending
|
||||
# so the most-used matching word comes first.
|
||||
matches = sorted(
|
||||
(w for w in word_counts if w.startswith(normalized_term)),
|
||||
key=lambda w: -word_counts[w],
|
||||
)
|
||||
|
||||
# Use binary search to find starting position
|
||||
start_idx = bisect.bisect_left(sorted_words, normalized_term)
|
||||
|
||||
# Collect matching words
|
||||
matches = []
|
||||
for i in range(start_idx, len(sorted_words)):
|
||||
word = sorted_words[i]
|
||||
if word.startswith(normalized_term):
|
||||
matches.append(word)
|
||||
if len(matches) >= limit:
|
||||
break
|
||||
else:
|
||||
break
|
||||
|
||||
return matches
|
||||
return matches[:limit]
|
||||
|
||||
def more_like_this(
|
||||
self,
|
||||
|
||||
@@ -137,6 +137,30 @@ class TestAutocomplete:
|
||||
results = backend.autocomplete("micro", limit=10)
|
||||
assert "microsoft" in results
|
||||
|
||||
def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
    """A prefix match found in more documents must precede rarer matches."""
    # Three of the titles below contain "payment" and only one contains
    # "payslip", so completing "pay" should list "payment" ahead of
    # "payslip".
    fixtures = [
        ("payment invoice", "AF1"),
        ("payment receipt", "AF2"),
        ("payment confirmation", "AF3"),
        ("payslip march", "AF4"),
    ]
    for pk, (title, checksum) in enumerate(fixtures, start=41):
        doc = Document.objects.create(
            title=title,
            content="details",
            checksum=checksum,
            pk=pk,
        )
        backend.add_or_update(doc)

    results = backend.autocomplete("pay", limit=10)

    # list.index raises ValueError when a word is missing, so a suggestion
    # that is absent altogether also fails the test.
    payment_rank = results.index("payment")
    payslip_rank = results.index("payslip")
    assert payment_rank < payslip_rank
|
||||
|
||||
|
||||
class TestMoreLikeThis:
|
||||
"""Test more like this functionality."""
|
||||
|
||||
Reference in New Issue
Block a user