class TantivyBackend:
    # NOTE: excerpt of src/documents/search/_backend.py. Only the two ID-only
    # lookup methods are reproduced here; the remaining members of the class
    # (search(), more_like_this(), autocomplete(), batch_update(), ...) are
    # defined elsewhere in that module.

    def search_ids(
        self,
        query: str,
        user: AbstractBaseUser | None,
        *,
        search_mode: SearchMode = SearchMode.QUERY,
        limit: int = 10000,
    ) -> list[int]:
        """
        Return the IDs of documents matching a query, without highlights.

        This is the lightweight companion to search(). Use it when you need the
        full set of matching IDs (e.g. for ``selection_data``) but don't need
        scores, ranks, or highlights. Each hit is still resolved through
        ``searcher.doc()`` to read its ``id`` field.

        Args:
            query: User's search query
            user: User for permission filtering (None for superuser/no filtering)
            search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
            limit: Maximum number of hits requested from the index. The fuzzy
                threshold is applied afterwards, so fewer IDs may be returned.

        Returns:
            List of document IDs in the order the searcher returned the hits
        """
        self._ensure_open()
        tz = get_current_timezone()

        # Only the general QUERY parser takes the active timezone; the simple
        # TEXT/TITLE parsers work from the index and the raw query alone.
        if search_mode is SearchMode.TEXT:
            user_query = parse_simple_text_query(self._index, query)
        elif search_mode is SearchMode.TITLE:
            user_query = parse_simple_title_query(self._index, query)
        else:
            user_query = parse_user_query(self._index, query, tz)

        if user is not None:
            # Both clauses are required: a hit has to match the query AND be
            # visible to this user.
            permission_filter = build_permission_filter(self._schema, user)
            final_query = tantivy.Query.boolean_query(
                [
                    (tantivy.Occur.Must, user_query),
                    (tantivy.Occur.Must, permission_filter),
                ],
            )
        else:
            final_query = user_query

        searcher = self._index.searcher()
        hits = searcher.search(final_query, limit=limit).hits  # (score, address)

        # Scores are only needed to enforce the fuzzy threshold, so skip the
        # normalisation pass entirely when no threshold is configured.
        threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
        if threshold is not None and hits:
            # Scale against the best hit; "or 1.0" avoids dividing by zero
            # when every score is 0.
            max_score = max(score for score, _addr in hits) or 1.0
            hits = [
                (score, addr)
                for score, addr in hits
                if score / max_score >= threshold
            ]

        # The ID is read as the first entry of the stored "id" field.
        return [searcher.doc(addr).to_dict()["id"][0] for _score, addr in hits]

    def more_like_this_ids(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
        *,
        limit: int = 10000,
    ) -> list[int]:
        """
        Return IDs of documents similar to the given document — no highlights.

        Lightweight companion to more_like_this(). The original document is
        excluded from results.

        Args:
            doc_id: Primary key of the reference document
            user: User for permission filtering (None for no filtering)
            limit: Maximum number of IDs to return

        Returns:
            List of similar document IDs (excluding the original), at most
            ``limit`` entries. Empty if the reference document is not indexed.
        """
        self._ensure_open()
        searcher = self._index.searcher()

        # Locate the reference document with an exact-match range [doc_id, doc_id].
        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        )
        lookup = searcher.search(id_query, limit=1)

        if not lookup.hits:
            return []

        reference_address = lookup.hits[0][1]
        mlt_query = tantivy.Query.more_like_this_query(
            reference_address,
            min_doc_frequency=1,
            max_doc_frequency=None,
            min_term_frequency=1,
            max_query_terms=12,
            min_word_length=None,
            max_word_length=None,
            boost_factor=None,
        )

        if user is not None:
            permission_filter = build_permission_filter(self._schema, user)
            final_query = tantivy.Query.boolean_query(
                [
                    (tantivy.Occur.Must, mlt_query),
                    (tantivy.Occur.Must, permission_filter),
                ],
            )
        else:
            final_query = mlt_query

        # Ask for one extra hit: the reference document can appear in its own
        # results and is dropped below, which would otherwise leave the caller
        # one short of ``limit``.
        results = searcher.search(final_query, limit=limit + 1)

        similar_ids = []
        for _score, doc_address in results.hits:
            result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
            if result_doc_id != doc_id:
                similar_ids.append(result_doc_id)

        # Trim the extra hit when the reference document was not among them.
        return similar_ids[:limit]
class TestSearchIds:
    """Exercise the ID-only search entry point, ``search_ids``."""

    def test_returns_matching_ids(self, backend: TantivyBackend):
        """Every document containing the query terms is returned, and no others."""
        expected_pks = set()
        for n in range(5):
            indexed = Document.objects.create(
                title=f"findable doc {n}",
                content="common keyword",
                checksum=f"SI{n}",
            )
            backend.add_or_update(indexed)
            expected_pks.add(indexed.pk)

        # A document sharing none of the query terms must stay out of the result.
        unrelated = Document.objects.create(
            title="unrelated",
            content="nothing here",
            checksum="SI_other",
        )
        backend.add_or_update(unrelated)

        found = backend.search_ids(
            "common keyword",
            user=None,
            search_mode=SearchMode.QUERY,
        )

        assert set(found) == expected_pks
        assert unrelated.pk not in found

    def test_respects_permission_filter(self, backend: TantivyBackend):
        """An owned document is visible to its owner and hidden from other users."""
        owner = User.objects.create_user("ids_owner")
        stranger = User.objects.create_user("ids_other")
        private_doc = Document.objects.create(
            title="private doc",
            content="secret keyword",
            checksum="SIP1",
            owner=owner,
        )
        backend.add_or_update(private_doc)

        seen_by_owner = backend.search_ids(
            "secret",
            user=owner,
            search_mode=SearchMode.QUERY,
        )
        assert seen_by_owner == [private_doc.pk]

        seen_by_stranger = backend.search_ids(
            "secret",
            user=stranger,
            search_mode=SearchMode.QUERY,
        )
        assert seen_by_stranger == []

    def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings):
        """A threshold above 1.0 filters out every hit, as normalised scores top out at 1."""
        indexed = Document.objects.create(
            title="threshold test",
            content="unique term",
            checksum="SIT1",
        )
        backend.add_or_update(indexed)

        settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1

        found = backend.search_ids(
            "unique",
            user=None,
            search_mode=SearchMode.QUERY,
        )
        assert found == []

    def test_returns_ids_for_text_mode(self, backend: TantivyBackend):
        """TEXT mode finds a document by a word from its content."""
        indexed = Document.objects.create(
            title="text mode doc",
            content="findable phrase",
            checksum="SIM1",
        )
        backend.add_or_update(indexed)

        found = backend.search_ids(
            "findable",
            user=None,
            search_mode=SearchMode.TEXT,
        )
        assert found == [indexed.pk]


class TestMoreLikeThis:
    # NOTE: excerpt of src/documents/tests/search/test_backend.py. Only the
    # ID-only test is reproduced here; the other tests of this class are
    # defined elsewhere in that module.

    def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
        """The reference document is left out; a document with the same content is kept."""
        reference_pk = 150
        lookalike_pk = 151

        reference = Document.objects.create(
            title="Important document",
            content="financial information report",
            checksum="MLTI1",
            pk=reference_pk,
        )
        lookalike = Document.objects.create(
            title="Another document",
            content="financial information report",
            checksum="MLTI2",
            pk=lookalike_pk,
        )
        for indexed in (reference, lookalike):
            backend.add_or_update(indexed)

        similar = backend.more_like_this_ids(doc_id=reference_pk, user=None)

        assert reference_pk not in similar
        assert lookalike_pk in similar