mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-14 03:58:52 +00:00
feat: add search_ids() and more_like_this_ids() lightweight methods
search_ids() returns only document IDs matching a query — no highlights, no SearchHit objects. more_like_this_ids() does the same for MLT queries. These provide lightweight paths when only IDs are needed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -598,6 +598,68 @@ class TantivyBackend:
|
||||
query=query,
|
||||
)
|
||||
|
||||
def search_ids(
|
||||
self,
|
||||
query: str,
|
||||
user: AbstractBaseUser | None,
|
||||
*,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
limit: int = 10000,
|
||||
) -> list[int]:
|
||||
"""
|
||||
Return document IDs matching a query — no highlights, no stored doc fetches.
|
||||
|
||||
This is the lightweight companion to search(). Use it when you need the
|
||||
full set of matching IDs (e.g. for ``selection_data``) but don't need
|
||||
scores, ranks, or highlights.
|
||||
|
||||
Args:
|
||||
query: User's search query
|
||||
user: User for permission filtering (None for superuser/no filtering)
|
||||
search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
|
||||
limit: Maximum number of IDs to return
|
||||
|
||||
Returns:
|
||||
List of document IDs in relevance order
|
||||
"""
|
||||
self._ensure_open()
|
||||
tz = get_current_timezone()
|
||||
if search_mode is SearchMode.TEXT:
|
||||
user_query = parse_simple_text_query(self._index, query)
|
||||
elif search_mode is SearchMode.TITLE:
|
||||
user_query = parse_simple_title_query(self._index, query)
|
||||
else:
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
|
||||
if user is not None:
|
||||
permission_filter = build_permission_filter(self._schema, user)
|
||||
final_query = tantivy.Query.boolean_query(
|
||||
[
|
||||
(tantivy.Occur.Must, user_query),
|
||||
(tantivy.Occur.Must, permission_filter),
|
||||
],
|
||||
)
|
||||
else:
|
||||
final_query = user_query
|
||||
|
||||
searcher = self._index.searcher()
|
||||
results = searcher.search(final_query, limit=limit)
|
||||
|
||||
all_hits = [(hit[1], hit[0]) for hit in results.hits]
|
||||
|
||||
# Normalize scores and apply threshold (same logic as search())
|
||||
if all_hits:
|
||||
max_score = max(hit[1] for hit in all_hits) or 1.0
|
||||
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
|
||||
|
||||
threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
|
||||
if threshold is not None:
|
||||
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
|
||||
|
||||
return [
|
||||
searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, _score in all_hits
|
||||
]
|
||||
|
||||
def autocomplete(
|
||||
self,
|
||||
term: str,
|
||||
@@ -761,6 +823,74 @@ class TantivyBackend:
|
||||
query=f"more_like:{doc_id}",
|
||||
)
|
||||
|
||||
def more_like_this_ids(
|
||||
self,
|
||||
doc_id: int,
|
||||
user: AbstractBaseUser | None,
|
||||
*,
|
||||
limit: int = 10000,
|
||||
) -> list[int]:
|
||||
"""
|
||||
Return IDs of documents similar to the given document — no highlights.
|
||||
|
||||
Lightweight companion to more_like_this(). The original document is
|
||||
excluded from results.
|
||||
|
||||
Args:
|
||||
doc_id: Primary key of the reference document
|
||||
user: User for permission filtering (None for no filtering)
|
||||
limit: Maximum number of IDs to return
|
||||
|
||||
Returns:
|
||||
List of similar document IDs (excluding the original)
|
||||
"""
|
||||
self._ensure_open()
|
||||
searcher = self._index.searcher()
|
||||
|
||||
id_query = tantivy.Query.range_query(
|
||||
self._schema,
|
||||
"id",
|
||||
tantivy.FieldType.Unsigned,
|
||||
doc_id,
|
||||
doc_id,
|
||||
)
|
||||
results = searcher.search(id_query, limit=1)
|
||||
|
||||
if not results.hits:
|
||||
return []
|
||||
|
||||
doc_address = results.hits[0][1]
|
||||
mlt_query = tantivy.Query.more_like_this_query(
|
||||
doc_address,
|
||||
min_doc_frequency=1,
|
||||
max_doc_frequency=None,
|
||||
min_term_frequency=1,
|
||||
max_query_terms=12,
|
||||
min_word_length=None,
|
||||
max_word_length=None,
|
||||
boost_factor=None,
|
||||
)
|
||||
|
||||
if user is not None:
|
||||
permission_filter = build_permission_filter(self._schema, user)
|
||||
final_query = tantivy.Query.boolean_query(
|
||||
[
|
||||
(tantivy.Occur.Must, mlt_query),
|
||||
(tantivy.Occur.Must, permission_filter),
|
||||
],
|
||||
)
|
||||
else:
|
||||
final_query = mlt_query
|
||||
|
||||
results = searcher.search(final_query, limit=limit)
|
||||
|
||||
ids = []
|
||||
for _score, doc_address in results.hits:
|
||||
result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
|
||||
if result_doc_id != doc_id:
|
||||
ids.append(result_doc_id)
|
||||
return ids
|
||||
|
||||
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
|
||||
"""
|
||||
Get a batch context manager for bulk index operations.
|
||||
|
||||
@@ -509,6 +509,82 @@ class TestSearch:
|
||||
assert "content" in hit["highlights"]
|
||||
|
||||
|
||||
class TestSearchIds:
|
||||
"""Test lightweight ID-only search."""
|
||||
|
||||
def test_returns_matching_ids(self, backend: TantivyBackend):
|
||||
"""search_ids must return IDs of all matching documents."""
|
||||
docs = []
|
||||
for i in range(5):
|
||||
doc = Document.objects.create(
|
||||
title=f"findable doc {i}",
|
||||
content="common keyword",
|
||||
checksum=f"SI{i}",
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
docs.append(doc)
|
||||
other = Document.objects.create(
|
||||
title="unrelated",
|
||||
content="nothing here",
|
||||
checksum="SI_other",
|
||||
)
|
||||
backend.add_or_update(other)
|
||||
|
||||
ids = backend.search_ids(
|
||||
"common keyword",
|
||||
user=None,
|
||||
search_mode=SearchMode.QUERY,
|
||||
)
|
||||
assert set(ids) == {d.pk for d in docs}
|
||||
assert other.pk not in ids
|
||||
|
||||
def test_respects_permission_filter(self, backend: TantivyBackend):
|
||||
"""search_ids must respect user permission filtering."""
|
||||
owner = User.objects.create_user("ids_owner")
|
||||
other = User.objects.create_user("ids_other")
|
||||
doc = Document.objects.create(
|
||||
title="private doc",
|
||||
content="secret keyword",
|
||||
checksum="SIP1",
|
||||
owner=owner,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
assert backend.search_ids(
|
||||
"secret",
|
||||
user=owner,
|
||||
search_mode=SearchMode.QUERY,
|
||||
) == [doc.pk]
|
||||
assert (
|
||||
backend.search_ids("secret", user=other, search_mode=SearchMode.QUERY) == []
|
||||
)
|
||||
|
||||
def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings):
|
||||
"""search_ids must apply the same fuzzy threshold as search()."""
|
||||
doc = Document.objects.create(
|
||||
title="threshold test",
|
||||
content="unique term",
|
||||
checksum="SIT1",
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1
|
||||
ids = backend.search_ids("unique", user=None, search_mode=SearchMode.QUERY)
|
||||
assert ids == []
|
||||
|
||||
def test_returns_ids_for_text_mode(self, backend: TantivyBackend):
|
||||
"""search_ids must work with TEXT search mode."""
|
||||
doc = Document.objects.create(
|
||||
title="text mode doc",
|
||||
content="findable phrase",
|
||||
checksum="SIM1",
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
ids = backend.search_ids("findable", user=None, search_mode=SearchMode.TEXT)
|
||||
assert ids == [doc.pk]
|
||||
|
||||
|
||||
class TestRebuild:
|
||||
"""Test index rebuilding functionality."""
|
||||
|
||||
@@ -622,6 +698,27 @@ class TestMoreLikeThis:
|
||||
assert results.hits == []
|
||||
assert results.total == 0
|
||||
|
||||
def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
|
||||
"""more_like_this_ids must return IDs of similar documents, excluding the original."""
|
||||
doc1 = Document.objects.create(
|
||||
title="Important document",
|
||||
content="financial information report",
|
||||
checksum="MLTI1",
|
||||
pk=150,
|
||||
)
|
||||
doc2 = Document.objects.create(
|
||||
title="Another document",
|
||||
content="financial information report",
|
||||
checksum="MLTI2",
|
||||
pk=151,
|
||||
)
|
||||
backend.add_or_update(doc1)
|
||||
backend.add_or_update(doc2)
|
||||
|
||||
ids = backend.more_like_this_ids(doc_id=150, user=None)
|
||||
assert 150 not in ids
|
||||
assert 151 in ids
|
||||
|
||||
|
||||
class TestSingleton:
|
||||
"""Test get_backend() and reset_backend() singleton lifecycle."""
|
||||
|
||||
Reference in New Issue
Block a user