refactor: remove dead more_like_this() method from TantivyBackend

The method is no longer called anywhere in production code — all callers
were migrated to more_like_this_ids() during the search pagination work.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton Holmes
2026-04-06 13:10:58 -07:00
parent 0b90f15602
commit 534fcfde6b
2 changed files with 0 additions and 150 deletions

View File

@@ -873,104 +873,6 @@ class TantivyBackend:
return matches[:limit]
def more_like_this(
self,
doc_id: int,
user: AbstractBaseUser | None,
page: int,
page_size: int,
) -> SearchResults:
"""
Find documents similar to the given document using content analysis.
Uses Tantivy's "more like this" query to find documents with similar
content patterns. The original document is excluded from results.
Args:
doc_id: Primary key of the reference document
user: User for permission filtering (None for no filtering)
page: Page number (1-indexed) for pagination
page_size: Number of results per page
Returns:
SearchResults with similar documents (excluding the original)
"""
self._ensure_open()
searcher = self._index.searcher()
# First find the document address
id_query = tantivy.Query.range_query(
self._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
)
results = searcher.search(id_query, limit=1)
if not results.hits:
# Document not found
return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")
# Extract doc_address from (score, doc_address) tuple
doc_address = results.hits[0][1]
# Build more like this query
mlt_query = tantivy.Query.more_like_this_query(
doc_address,
min_doc_frequency=1,
max_doc_frequency=None,
min_term_frequency=1,
max_query_terms=12,
min_word_length=None,
max_word_length=None,
boost_factor=None,
)
final_query = self._apply_permission_filter(mlt_query, user)
# Search
offset = (page - 1) * page_size
results = searcher.search(final_query, limit=offset + page_size)
total = results.count
# Convert from (score, doc_address) to (doc_address, score)
all_hits = [(hit[1], hit[0]) for hit in results.hits]
# Normalize scores
if all_hits:
max_score = max(hit[1] for hit in all_hits) or 1.0
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
# Get page hits
page_hits = all_hits[offset : offset + page_size]
# Build results
hits: list[SearchHit] = []
for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
actual_doc = searcher.doc(doc_address)
doc_dict = actual_doc.to_dict()
result_doc_id = doc_dict["id"][0]
# Skip the original document
if result_doc_id == doc_id:
continue
hits.append(
SearchHit(
id=result_doc_id,
score=score,
rank=rank,
highlights={}, # MLT doesn't generate highlights
),
)
return SearchResults(
hits=hits,
total=max(0, total - 1), # Subtract 1 for the original document
query=f"more_like:{doc_id}",
)
def more_like_this_ids(
self,
doc_id: int,

View File

@@ -646,58 +646,6 @@ class TestAutocomplete:
class TestMoreLikeThis:
"""Test more like this functionality."""
def test_excludes_original(self, backend: TantivyBackend):
"""More like this queries must exclude the reference document from results."""
doc1 = Document.objects.create(
title="Important document",
content="financial information",
checksum="MLT1",
pk=50,
)
doc2 = Document.objects.create(
title="Another document",
content="financial report",
checksum="MLT2",
pk=51,
)
backend.add_or_update(doc1)
backend.add_or_update(doc2)
results = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10)
returned_ids = [hit["id"] for hit in results.hits]
assert 50 not in returned_ids # Original document excluded
def test_with_user_applies_permission_filter(self, backend: TantivyBackend):
"""more_like_this with a user must exclude documents that user cannot see."""
viewer = User.objects.create_user("mlt_viewer")
other = User.objects.create_user("mlt_other")
public_doc = Document.objects.create(
title="Public financial document",
content="quarterly financial analysis report figures",
checksum="MLT3",
pk=52,
)
private_doc = Document.objects.create(
title="Private financial document",
content="quarterly financial analysis report figures",
checksum="MLT4",
pk=53,
owner=other,
)
backend.add_or_update(public_doc)
backend.add_or_update(private_doc)
results = backend.more_like_this(doc_id=52, user=viewer, page=1, page_size=10)
returned_ids = [hit["id"] for hit in results.hits]
# private_doc is owned by other, so viewer cannot see it
assert 53 not in returned_ids
def test_document_not_in_index_returns_empty(self, backend: TantivyBackend):
"""more_like_this for a doc_id absent from the index must return empty results."""
results = backend.more_like_this(doc_id=9999, user=None, page=1, page_size=10)
assert results.hits == []
assert results.total == 0
def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
"""more_like_this_ids must return IDs of similar documents, excluding the original."""
doc1 = Document.objects.create(