Enhancement: rank autocomplete suggestions by document frequency

Replace set-based alphabetical autocomplete with Counter-based
document-frequency ordering. Words appearing in more of the user's
visible documents rank first — the same signal Whoosh used for
Tf/Idf-based ordering, computed permission-correctly from already-
fetched stored values at no extra index cost.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 13:25:56 -07:00
parent b626f5602c
commit b10f3de2eb
3 changed files with 40 additions and 24 deletions

View File

@@ -167,8 +167,8 @@ Query parameters:
- `term`: The incomplete term.
- `limit`: Amount of results. Defaults to 10.
Results are ordered alphabetically by prefix match. The first result is
the lexicographically first word in the index that starts with the given term.
Results are ordered by how many of the user's visible documents contain
each matching word. The first result is the word that appears in the most documents.
```json
["term1", "term3", "term6", "term4"]

View File

@@ -1,9 +1,9 @@
from __future__ import annotations
import bisect
import logging
import threading
import unicodedata
from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
@@ -524,8 +524,11 @@ class TantivyBackend:
results = searcher.search(base_query, limit=10000)
# Collect all autocomplete words
words = set()
# Count how many visible documents each word appears in.
# Using Counter (not set) preserves per-word document frequency so
# we can rank suggestions by how commonly they occur — the same
# signal Whoosh used for Tf/Idf-based autocomplete ordering.
word_counts: Counter[str] = Counter()
for hit in results.hits:
# hits are (score, doc_address) tuples
doc_address = hit[1] if len(hit) == 2 else hit[0]
@@ -533,27 +536,16 @@ class TantivyBackend:
stored_doc = searcher.doc(doc_address)
doc_dict = stored_doc.to_dict()
if "autocomplete_word" in doc_dict:
for word in doc_dict["autocomplete_word"]:
words.add(word)
word_counts.update(doc_dict["autocomplete_word"])
# Sort and find matches
sorted_words = sorted(words)
# Filter to prefix matches, then sort by document frequency descending
# so the most-used matching word comes first.
matches = sorted(
(w for w in word_counts if w.startswith(normalized_term)),
key=lambda w: -word_counts[w],
)
# Use binary search to find starting position
start_idx = bisect.bisect_left(sorted_words, normalized_term)
# Collect matching words
matches = []
for i in range(start_idx, len(sorted_words)):
word = sorted_words[i]
if word.startswith(normalized_term):
matches.append(word)
if len(matches) >= limit:
break
else:
break
return matches
return matches[:limit]
def more_like_this(
self,

View File

@@ -137,6 +137,30 @@ class TestAutocomplete:
results = backend.autocomplete("micro", limit=10)
assert "microsoft" in results
def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
    """Most-used prefix match should rank first."""
    # "payment" occurs in 3 documents while "payslip" occurs in only 1,
    # so an autocomplete query for the shared prefix "pay" must rank
    # "payment" ahead of "payslip".
    fixtures = [
        ("payment invoice", "AF1"),
        ("payment receipt", "AF2"),
        ("payment confirmation", "AF3"),
        ("payslip march", "AF4"),
    ]
    for pk, (doc_title, doc_checksum) in enumerate(fixtures, start=41):
        created = Document.objects.create(
            title=doc_title,
            content="details",
            checksum=doc_checksum,
            pk=pk,
        )
        backend.add_or_update(created)
    suggestions = backend.autocomplete("pay", limit=10)
    assert suggestions.index("payment") < suggestions.index("payslip")
class TestMoreLikeThis:
"""Test more like this functionality."""