mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-07 16:48:54 +00:00
refactor: remove dead more_like_this() method from TantivyBackend
The method is no longer called anywhere in production code — all callers were migrated to more_like_this_ids() during the search pagination work. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -873,104 +873,6 @@ class TantivyBackend:
|
||||
|
||||
return matches[:limit]
|
||||
|
||||
def more_like_this(
|
||||
self,
|
||||
doc_id: int,
|
||||
user: AbstractBaseUser | None,
|
||||
page: int,
|
||||
page_size: int,
|
||||
) -> SearchResults:
|
||||
"""
|
||||
Find documents similar to the given document using content analysis.
|
||||
|
||||
Uses Tantivy's "more like this" query to find documents with similar
|
||||
content patterns. The original document is excluded from results.
|
||||
|
||||
Args:
|
||||
doc_id: Primary key of the reference document
|
||||
user: User for permission filtering (None for no filtering)
|
||||
page: Page number (1-indexed) for pagination
|
||||
page_size: Number of results per page
|
||||
|
||||
Returns:
|
||||
SearchResults with similar documents (excluding the original)
|
||||
"""
|
||||
self._ensure_open()
|
||||
searcher = self._index.searcher()
|
||||
|
||||
# First find the document address
|
||||
id_query = tantivy.Query.range_query(
|
||||
self._schema,
|
||||
"id",
|
||||
tantivy.FieldType.Unsigned,
|
||||
doc_id,
|
||||
doc_id,
|
||||
)
|
||||
results = searcher.search(id_query, limit=1)
|
||||
|
||||
if not results.hits:
|
||||
# Document not found
|
||||
return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")
|
||||
|
||||
# Extract doc_address from (score, doc_address) tuple
|
||||
doc_address = results.hits[0][1]
|
||||
|
||||
# Build more like this query
|
||||
mlt_query = tantivy.Query.more_like_this_query(
|
||||
doc_address,
|
||||
min_doc_frequency=1,
|
||||
max_doc_frequency=None,
|
||||
min_term_frequency=1,
|
||||
max_query_terms=12,
|
||||
min_word_length=None,
|
||||
max_word_length=None,
|
||||
boost_factor=None,
|
||||
)
|
||||
|
||||
final_query = self._apply_permission_filter(mlt_query, user)
|
||||
|
||||
# Search
|
||||
offset = (page - 1) * page_size
|
||||
results = searcher.search(final_query, limit=offset + page_size)
|
||||
|
||||
total = results.count
|
||||
# Convert from (score, doc_address) to (doc_address, score)
|
||||
all_hits = [(hit[1], hit[0]) for hit in results.hits]
|
||||
|
||||
# Normalize scores
|
||||
if all_hits:
|
||||
max_score = max(hit[1] for hit in all_hits) or 1.0
|
||||
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
|
||||
|
||||
# Get page hits
|
||||
page_hits = all_hits[offset : offset + page_size]
|
||||
|
||||
# Build results
|
||||
hits: list[SearchHit] = []
|
||||
for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
|
||||
actual_doc = searcher.doc(doc_address)
|
||||
doc_dict = actual_doc.to_dict()
|
||||
result_doc_id = doc_dict["id"][0]
|
||||
|
||||
# Skip the original document
|
||||
if result_doc_id == doc_id:
|
||||
continue
|
||||
|
||||
hits.append(
|
||||
SearchHit(
|
||||
id=result_doc_id,
|
||||
score=score,
|
||||
rank=rank,
|
||||
highlights={}, # MLT doesn't generate highlights
|
||||
),
|
||||
)
|
||||
|
||||
return SearchResults(
|
||||
hits=hits,
|
||||
total=max(0, total - 1), # Subtract 1 for the original document
|
||||
query=f"more_like:{doc_id}",
|
||||
)
|
||||
|
||||
def more_like_this_ids(
|
||||
self,
|
||||
doc_id: int,
|
||||
|
||||
@@ -646,58 +646,6 @@ class TestAutocomplete:
|
||||
class TestMoreLikeThis:
|
||||
"""Test more like this functionality."""
|
||||
|
||||
def test_excludes_original(self, backend: TantivyBackend):
|
||||
"""More like this queries must exclude the reference document from results."""
|
||||
doc1 = Document.objects.create(
|
||||
title="Important document",
|
||||
content="financial information",
|
||||
checksum="MLT1",
|
||||
pk=50,
|
||||
)
|
||||
doc2 = Document.objects.create(
|
||||
title="Another document",
|
||||
content="financial report",
|
||||
checksum="MLT2",
|
||||
pk=51,
|
||||
)
|
||||
backend.add_or_update(doc1)
|
||||
backend.add_or_update(doc2)
|
||||
|
||||
results = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10)
|
||||
returned_ids = [hit["id"] for hit in results.hits]
|
||||
assert 50 not in returned_ids # Original document excluded
|
||||
|
||||
def test_with_user_applies_permission_filter(self, backend: TantivyBackend):
|
||||
"""more_like_this with a user must exclude documents that user cannot see."""
|
||||
viewer = User.objects.create_user("mlt_viewer")
|
||||
other = User.objects.create_user("mlt_other")
|
||||
public_doc = Document.objects.create(
|
||||
title="Public financial document",
|
||||
content="quarterly financial analysis report figures",
|
||||
checksum="MLT3",
|
||||
pk=52,
|
||||
)
|
||||
private_doc = Document.objects.create(
|
||||
title="Private financial document",
|
||||
content="quarterly financial analysis report figures",
|
||||
checksum="MLT4",
|
||||
pk=53,
|
||||
owner=other,
|
||||
)
|
||||
backend.add_or_update(public_doc)
|
||||
backend.add_or_update(private_doc)
|
||||
|
||||
results = backend.more_like_this(doc_id=52, user=viewer, page=1, page_size=10)
|
||||
returned_ids = [hit["id"] for hit in results.hits]
|
||||
# private_doc is owned by other, so viewer cannot see it
|
||||
assert 53 not in returned_ids
|
||||
|
||||
def test_document_not_in_index_returns_empty(self, backend: TantivyBackend):
|
||||
"""more_like_this for a doc_id absent from the index must return empty results."""
|
||||
results = backend.more_like_this(doc_id=9999, user=None, page=1, page_size=10)
|
||||
assert results.hits == []
|
||||
assert results.total == 0
|
||||
|
||||
def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
|
||||
"""more_like_this_ids must return IDs of similar documents, excluding the original."""
|
||||
doc1 = Document.objects.create(
|
||||
|
||||
Reference in New Issue
Block a user