diff --git a/docs/api.md b/docs/api.md index 23fd2dd05..414fe16da 100644 --- a/docs/api.md +++ b/docs/api.md @@ -167,8 +167,8 @@ Query parameters: - `term`: The incomplete term. - `limit`: Amount of results. Defaults to 10. -Results are ordered alphabetically by prefix match. The first result is -the lexicographically first word in the index that starts with the given term. +Results are ordered by how many of the user's visible documents contain +each matching word. The first result is the word that appears in the most documents. ```json ["term1", "term3", "term6", "term4"] diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index b9be04a0a..e3722a017 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -1,9 +1,9 @@ from __future__ import annotations -import bisect import logging import threading import unicodedata +from collections import Counter from dataclasses import dataclass from datetime import UTC from datetime import datetime @@ -524,8 +524,11 @@ class TantivyBackend: results = searcher.search(base_query, limit=10000) - # Collect all autocomplete words - words = set() + # Count how many visible documents each word appears in. + # Using Counter (not set) preserves per-word document frequency so + # we can rank suggestions by how commonly they occur — the same + # signal Whoosh used for Tf/Idf-based autocomplete ordering. 
+ word_counts: Counter[str] = Counter() for hit in results.hits: # hits are (score, doc_address) tuples doc_address = hit[1] if len(hit) == 2 else hit[0] @@ -533,27 +536,17 @@ class TantivyBackend: stored_doc = searcher.doc(doc_address) doc_dict = stored_doc.to_dict() if "autocomplete_word" in doc_dict: - for word in doc_dict["autocomplete_word"]: - words.add(word) + # Deduplicate per document so a repeated stored value counts once. + word_counts.update(set(doc_dict["autocomplete_word"])) - # Sort and find matches - sorted_words = sorted(words) + # Filter to prefix matches, then sort by document frequency descending; + # break ties alphabetically so equal-count words have a stable order. + matches = sorted( + (w for w in word_counts if w.startswith(normalized_term)), + key=lambda w: (-word_counts[w], w), + ) - # Use binary search to find starting position - start_idx = bisect.bisect_left(sorted_words, normalized_term) - - # Collect matching words - matches = [] - for i in range(start_idx, len(sorted_words)): - word = sorted_words[i] - if word.startswith(normalized_term): - matches.append(word) - if len(matches) >= limit: - break - else: - break - - return matches + return matches[:limit] def more_like_this( self, diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index bc4d91357..23adfda85 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -137,6 +137,30 @@ class TestAutocomplete: results = backend.autocomplete("micro", limit=10) assert "microsoft" in results + def test_results_ordered_by_document_frequency(self, backend: TantivyBackend): + """Most-used prefix match should rank first.""" + # "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should + # return "payment" before "payslip". 
+ for i, (title, checksum) in enumerate( + [ + ("payment invoice", "AF1"), + ("payment receipt", "AF2"), + ("payment confirmation", "AF3"), + ("payslip march", "AF4"), + ], + start=41, + ): + doc = Document.objects.create( + title=title, + content="details", + checksum=checksum, + pk=i, + ) + backend.add_or_update(doc) + + results = backend.autocomplete("pay", limit=10) + assert results.index("payment") < results.index("payslip") + class TestMoreLikeThis: """Test more like this functionality."""