Enhancement: rank autocomplete suggestions by document frequency

Replace set-based alphabetical autocomplete with Counter-based
document-frequency ordering. Words appearing in more of the user's
visible documents rank first — the same signal Whoosh used for its
Tf/Idf-based ordering — computed from already-fetched stored values, so
document permissions are respected and there is no extra index cost.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 13:25:56 -07:00
parent b626f5602c
commit b10f3de2eb
3 changed files with 40 additions and 24 deletions

View File

@@ -1,9 +1,9 @@
from __future__ import annotations
import bisect
import logging
import threading
import unicodedata
from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
@@ -524,8 +524,11 @@ class TantivyBackend:
results = searcher.search(base_query, limit=10000)
# Collect all autocomplete words
words = set()
# Count how many visible documents each word appears in.
# Using Counter (not set) preserves per-word document frequency so
# we can rank suggestions by how commonly they occur — the same
# signal Whoosh used for Tf/Idf-based autocomplete ordering.
word_counts: Counter[str] = Counter()
for hit in results.hits:
# hits are (score, doc_address) tuples
doc_address = hit[1] if len(hit) == 2 else hit[0]
@@ -533,27 +536,16 @@ class TantivyBackend:
stored_doc = searcher.doc(doc_address)
doc_dict = stored_doc.to_dict()
if "autocomplete_word" in doc_dict:
for word in doc_dict["autocomplete_word"]:
words.add(word)
word_counts.update(doc_dict["autocomplete_word"])
# Sort and find matches
sorted_words = sorted(words)
# Filter to prefix matches, then sort by document frequency descending
# so the most-used matching word comes first.
matches = sorted(
(w for w in word_counts if w.startswith(normalized_term)),
key=lambda w: -word_counts[w],
)
# Use binary search to find starting position
start_idx = bisect.bisect_left(sorted_words, normalized_term)
# Collect matching words
matches = []
for i in range(start_idx, len(sorted_words)):
word = sorted_words[i]
if word.startswith(normalized_term):
matches.append(word)
if len(matches) >= limit:
break
else:
break
return matches
return matches[:limit]
def more_like_this(
self,

View File

@@ -137,6 +137,30 @@ class TestAutocomplete:
results = backend.autocomplete("micro", limit=10)
assert "microsoft" in results
def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
    """A prefix match occurring in more documents must rank ahead of rarer ones."""
    # Index four documents: "payment" appears in three titles and
    # "payslip" in one, so autocompleting "pay" should list "payment"
    # before "payslip".
    fixtures = [
        ("payment invoice", "AF1"),
        ("payment receipt", "AF2"),
        ("payment confirmation", "AF3"),
        ("payslip march", "AF4"),
    ]
    for pk, (title, checksum) in enumerate(fixtures, start=41):
        document = Document.objects.create(
            title=title,
            content="details",
            checksum=checksum,
            pk=pk,
        )
        backend.add_or_update(document)
    suggestions = backend.autocomplete("pay", limit=10)
    assert suggestions.index("payment") < suggestions.index("payslip")
class TestMoreLikeThis:
"""Test more like this functionality."""