Enhancement: rank autocomplete suggestions by document frequency

Replace set-based alphabetical autocomplete with Counter-based
document-frequency ordering. Words appearing in more of the user's
visible documents rank first — the same signal Whoosh used for its
Tf/Idf-based ordering — computed from already-fetched stored values, so
document permissions are respected and there is no extra index cost.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 13:25:56 -07:00
parent b626f5602c
commit b10f3de2eb
3 changed files with 40 additions and 24 deletions

View File

@@ -1,9 +1,9 @@
from __future__ import annotations
import bisect
import logging
import threading
import unicodedata
from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
@@ -524,8 +524,11 @@ class TantivyBackend:
results = searcher.search(base_query, limit=10000)
# Collect all autocomplete words
words = set()
# Count how many visible documents each word appears in.
# Using Counter (not set) preserves per-word document frequency so
# we can rank suggestions by how commonly they occur — the same
# signal Whoosh used for Tf/Idf-based autocomplete ordering.
word_counts: Counter[str] = Counter()
for hit in results.hits:
# hits are (score, doc_address) tuples
doc_address = hit[1] if len(hit) == 2 else hit[0]
@@ -533,27 +536,16 @@ class TantivyBackend:
stored_doc = searcher.doc(doc_address)
doc_dict = stored_doc.to_dict()
if "autocomplete_word" in doc_dict:
for word in doc_dict["autocomplete_word"]:
words.add(word)
word_counts.update(doc_dict["autocomplete_word"])
# Sort and find matches
sorted_words = sorted(words)
# Filter to prefix matches, then sort by document frequency descending
# so the most-used matching word comes first.
matches = sorted(
(w for w in word_counts if w.startswith(normalized_term)),
key=lambda w: -word_counts[w],
)
# Use binary search to find starting position
start_idx = bisect.bisect_left(sorted_words, normalized_term)
# Collect matching words
matches = []
for i in range(start_idx, len(sorted_words)):
word = sorted_words[i]
if word.startswith(normalized_term):
matches.append(word)
if len(matches) >= limit:
break
else:
break
return matches
return matches[:limit]
def more_like_this(
self,

View File

@@ -137,6 +137,30 @@ class TestAutocomplete:
results = backend.autocomplete("micro", limit=10)
assert "microsoft" in results
def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
    """A prefix match occurring in more documents must rank ahead of rarer ones."""
    # Index four documents: "payment" appears in three titles and
    # "payslip" in one, so autocompleting "pay" should list "payment"
    # before "payslip".
    fixtures = [
        ("payment invoice", "AF1"),
        ("payment receipt", "AF2"),
        ("payment confirmation", "AF3"),
        ("payslip march", "AF4"),
    ]
    for pk, (title, checksum) in enumerate(fixtures, start=41):
        document = Document.objects.create(
            title=title,
            content="details",
            checksum=checksum,
            pk=pk,
        )
        backend.add_or_update(document)
    suggestions = backend.autocomplete("pay", limit=10)
    assert suggestions.index("payment") < suggestions.index("payslip")
class TestMoreLikeThis:
"""Test more like this functionality."""