From 534fcfde6b7cdc7112ffb3b44e88f927a062b330 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Mon, 6 Apr 2026 13:10:58 -0700 Subject: [PATCH] refactor: remove dead more_like_this() method from TantivyBackend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The method is no longer called anywhere in production code — all callers were migrated to more_like_this_ids() during the search pagination work. Co-Authored-By: Claude Opus 4.6 --- src/documents/search/_backend.py | 98 ---------------------- src/documents/tests/search/test_backend.py | 52 ------------ 2 files changed, 150 deletions(-) diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 49fdb4111..ab2f5a104 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -873,104 +873,6 @@ class TantivyBackend: return matches[:limit] - def more_like_this( - self, - doc_id: int, - user: AbstractBaseUser | None, - page: int, - page_size: int, - ) -> SearchResults: - """ - Find documents similar to the given document using content analysis. - - Uses Tantivy's "more like this" query to find documents with similar - content patterns. The original document is excluded from results. - - Args: - doc_id: Primary key of the reference document - user: User for permission filtering (None for no filtering) - page: Page number (1-indexed) for pagination - page_size: Number of results per page - - Returns: - SearchResults with similar documents (excluding the original) - """ - self._ensure_open() - searcher = self._index.searcher() - - # First find the document address - id_query = tantivy.Query.range_query( - self._schema, - "id", - tantivy.FieldType.Unsigned, - doc_id, - doc_id, - ) - results = searcher.search(id_query, limit=1) - - if not results.hits: - # Document not found - return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}") - - # Extract doc_address from (score, doc_address) tuple - doc_address = results.hits[0][1] - - # Build more like this query - mlt_query = tantivy.Query.more_like_this_query( - doc_address, - min_doc_frequency=1, - max_doc_frequency=None, - min_term_frequency=1, - max_query_terms=12, - min_word_length=None, - max_word_length=None, - boost_factor=None, - ) - - final_query = self._apply_permission_filter(mlt_query, user) - - # Search - offset = (page - 1) * page_size - results = searcher.search(final_query, limit=offset + page_size) - - total = results.count - # Convert from (score, doc_address) to (doc_address, score) - all_hits = [(hit[1], hit[0]) for hit in results.hits] - - # Normalize scores - if all_hits: - max_score = max(hit[1] for hit in all_hits) or 1.0 - all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits] - - # Get page hits - page_hits = all_hits[offset : offset + page_size] - - # Build results - hits: list[SearchHit] = [] - for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1): - actual_doc = searcher.doc(doc_address) - doc_dict = actual_doc.to_dict() - result_doc_id = doc_dict["id"][0] - - # Skip the original document - if result_doc_id == doc_id: - continue - - hits.append( - SearchHit( - id=result_doc_id, - score=score, - rank=rank, - highlights={}, # MLT doesn't generate highlights - ), - ) - - return SearchResults( - hits=hits, - total=max(0, total - 1), # Subtract 1 for the original document - query=f"more_like:{doc_id}", - ) - def more_like_this_ids( self, doc_id: int, diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index d5b8a0122..71099c8c9 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -646,58 +646,6 @@ class TestAutocomplete: class TestMoreLikeThis: """Test more like this functionality.""" - def test_excludes_original(self, backend: TantivyBackend): - """More like this queries must exclude the reference document from results.""" - doc1 = Document.objects.create( - title="Important document", - content="financial information", - checksum="MLT1", - pk=50, - ) - doc2 = Document.objects.create( - title="Another document", - content="financial report", - checksum="MLT2", - pk=51, - ) - backend.add_or_update(doc1) - backend.add_or_update(doc2) - - results = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10) - returned_ids = [hit["id"] for hit in results.hits] - assert 50 not in returned_ids # Original document excluded - - def test_with_user_applies_permission_filter(self, backend: TantivyBackend): - """more_like_this with a user must exclude documents that user cannot see.""" - viewer = User.objects.create_user("mlt_viewer") - other = User.objects.create_user("mlt_other") - public_doc = Document.objects.create( - title="Public financial document", - content="quarterly financial analysis report figures", - checksum="MLT3", - pk=52, - ) - private_doc = Document.objects.create( - title="Private financial document", - content="quarterly financial analysis report figures", - checksum="MLT4", - pk=53, - owner=other, - ) - backend.add_or_update(public_doc) - backend.add_or_update(private_doc) - - results = backend.more_like_this(doc_id=52, user=viewer, page=1, page_size=10) - returned_ids = [hit["id"] for hit in results.hits] - # private_doc is owned by other, so viewer cannot see it - assert 53 not in returned_ids - - def test_document_not_in_index_returns_empty(self, backend: TantivyBackend): - """more_like_this for a doc_id absent from the index must return empty results.""" - results = backend.more_like_this(doc_id=9999, user=None, page=1, page_size=10) - assert results.hits == [] - assert results.total == 0 - def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend): """more_like_this_ids must return IDs of similar documents, excluding the original.""" doc1 = Document.objects.create(