feat: add search_ids() and more_like_this_ids() lightweight methods

search_ids() returns only document IDs matching a query — no highlights,
no SearchHit objects. more_like_this_ids() does the same for MLT queries.
These provide lightweight paths when only IDs are needed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton Holmes
2026-04-03 15:21:52 -07:00
parent 0d915c58a4
commit 6cf01dd383
2 changed files with 227 additions and 0 deletions

View File

@@ -598,6 +598,68 @@ class TantivyBackend:
query=query,
)
def search_ids(
    self,
    query: str,
    user: AbstractBaseUser | None,
    *,
    search_mode: SearchMode = SearchMode.QUERY,
    limit: int = 10000,
) -> list[int]:
    """
    Return the IDs of documents matching a query, without highlights.

    This is the lightweight companion to search(). Use it when you need the
    full set of matching IDs (e.g. for ``selection_data``) but don't need
    scores, ranks, or highlights. Note that the stored document of every
    surviving hit is still loaded once in order to read its ``id`` field.

    Args:
        query: User's search query
        user: User for permission filtering (None for superuser/no filtering)
        search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
        limit: Maximum number of hits requested from the index. The score
            threshold is applied afterwards, so fewer IDs may be returned.

    Returns:
        List of document IDs, in the order the searcher returned the hits
        (relevance order)
    """
    self._ensure_open()

    if search_mode is SearchMode.TEXT:
        user_query = parse_simple_text_query(self._index, query)
    elif search_mode is SearchMode.TITLE:
        user_query = parse_simple_title_query(self._index, query)
    else:
        # Only the full query parser takes the active timezone.
        user_query = parse_user_query(
            self._index,
            query,
            get_current_timezone(),
        )

    if user is not None:
        # Both clauses are mandatory: a hit must match the query AND be
        # visible to the user.
        permission_filter = build_permission_filter(self._schema, user)
        final_query = tantivy.Query.boolean_query(
            [
                (tantivy.Occur.Must, user_query),
                (tantivy.Occur.Must, permission_filter),
            ],
        )
    else:
        final_query = user_query

    searcher = self._index.searcher()
    # Each hit is a (score, doc_address) pair.
    hits = searcher.search(final_query, limit=limit).hits

    # Normalize scores against the best hit and apply the threshold (meant
    # to mirror search()). Scores are not part of the return value, so the
    # normalization is skipped entirely when no threshold is configured.
    threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
    if threshold is not None and hits:
        # ``or 1.0`` avoids dividing by zero when every score is 0.
        max_score = max(score for score, _doc_addr in hits) or 1.0
        hits = [hit for hit in hits if hit[0] / max_score >= threshold]

    return [
        searcher.doc(doc_addr).to_dict()["id"][0] for _score, doc_addr in hits
    ]
def autocomplete(
self,
term: str,
@@ -761,6 +823,74 @@ class TantivyBackend:
query=f"more_like:{doc_id}",
)
def more_like_this_ids(
    self,
    doc_id: int,
    user: AbstractBaseUser | None,
    *,
    limit: int = 10000,
) -> list[int]:
    """
    Find documents similar to a reference document and return only their IDs.

    ID-only counterpart of more_like_this(): no highlights are produced, and
    the reference document itself is dropped from the result.

    Args:
        doc_id: Primary key of the reference document
        user: User for permission filtering (None for no filtering)
        limit: Maximum number of hits requested from the index

    Returns:
        IDs of the similar documents, never including ``doc_id``. Empty when
        the reference document cannot be found in the index.
    """
    self._ensure_open()
    searcher = self._index.searcher()

    # Locate the reference document with a range query whose lower and
    # upper bounds are both doc_id.
    lookup = searcher.search(
        tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        ),
        limit=1,
    )
    reference_hits = lookup.hits
    if not reference_hits:
        return []

    # Hits are (score, doc_address) pairs; only the address is needed here.
    reference_address = reference_hits[0][1]
    similarity_query = tantivy.Query.more_like_this_query(
        reference_address,
        min_doc_frequency=1,
        max_doc_frequency=None,
        min_term_frequency=1,
        max_query_terms=12,
        min_word_length=None,
        max_word_length=None,
        boost_factor=None,
    )

    final_query = similarity_query
    if user is not None:
        # Require both similarity and visibility to the user.
        visibility = build_permission_filter(self._schema, user)
        final_query = tantivy.Query.boolean_query(
            [
                (tantivy.Occur.Must, similarity_query),
                (tantivy.Occur.Must, visibility),
            ],
        )

    similar = searcher.search(final_query, limit=limit)
    candidate_ids = (
        searcher.doc(address).to_dict()["id"][0] for _score, address in similar.hits
    )
    # The reference document is removed after the search, so fewer than
    # ``limit`` IDs may come back.
    return [candidate for candidate in candidate_ids if candidate != doc_id]
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
"""
Get a batch context manager for bulk index operations.

View File

@@ -509,6 +509,82 @@ class TestSearch:
assert "content" in hit["highlights"]
class TestSearchIds:
    """Exercise search_ids(), the ID-only variant of search()."""

    def test_returns_matching_ids(self, backend: TantivyBackend):
        """Every document matching the query is returned, and nothing else."""
        matching_pks = set()
        for n in range(5):
            match = Document.objects.create(
                title=f"findable doc {n}",
                content="common keyword",
                checksum=f"SI{n}",
            )
            backend.add_or_update(match)
            matching_pks.add(match.pk)

        # A document that shares no term with the query must stay out.
        unrelated = Document.objects.create(
            title="unrelated",
            content="nothing here",
            checksum="SI_other",
        )
        backend.add_or_update(unrelated)

        found = backend.search_ids(
            "common keyword",
            user=None,
            search_mode=SearchMode.QUERY,
        )

        assert set(found) == matching_pks
        assert unrelated.pk not in found

    def test_respects_permission_filter(self, backend: TantivyBackend):
        """An owned document is visible to its owner but not to another user."""
        owner = User.objects.create_user("ids_owner")
        stranger = User.objects.create_user("ids_other")
        private_doc = Document.objects.create(
            title="private doc",
            content="secret keyword",
            checksum="SIP1",
            owner=owner,
        )
        backend.add_or_update(private_doc)

        seen_by_owner = backend.search_ids(
            "secret",
            user=owner,
            search_mode=SearchMode.QUERY,
        )
        assert seen_by_owner == [private_doc.pk]

        seen_by_stranger = backend.search_ids(
            "secret",
            user=stranger,
            search_mode=SearchMode.QUERY,
        )
        assert seen_by_stranger == []

    def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings):
        """The configured fuzzy threshold filters search_ids() results too."""
        indexed = Document.objects.create(
            title="threshold test",
            content="unique term",
            checksum="SIT1",
        )
        backend.add_or_update(indexed)

        # Normalized scores top out at 1.0, so a threshold of 1.1 is expected
        # to reject every hit.
        settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1

        found = backend.search_ids("unique", user=None, search_mode=SearchMode.QUERY)
        assert found == []

    def test_returns_ids_for_text_mode(self, backend: TantivyBackend):
        """TEXT search mode is supported and finds the indexed document."""
        indexed = Document.objects.create(
            title="text mode doc",
            content="findable phrase",
            checksum="SIM1",
        )
        backend.add_or_update(indexed)

        found = backend.search_ids("findable", user=None, search_mode=SearchMode.TEXT)
        assert found == [indexed.pk]
class TestRebuild:
"""Test index rebuilding functionality."""
@@ -622,6 +698,27 @@ class TestMoreLikeThis:
assert results.hits == []
assert results.total == 0
def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
    """The reference document is left out; a similar document is included."""
    reference = Document.objects.create(
        title="Important document",
        content="financial information report",
        checksum="MLTI1",
        pk=150,
    )
    # Same content as the reference, so it should count as similar.
    lookalike = Document.objects.create(
        title="Another document",
        content="financial information report",
        checksum="MLTI2",
        pk=151,
    )
    for document in (reference, lookalike):
        backend.add_or_update(document)

    similar_ids = backend.more_like_this_ids(doc_id=150, user=None)

    assert 150 not in similar_ids
    assert 151 in similar_ids
class TestSingleton:
"""Test get_backend() and reset_backend() singleton lifecycle."""