feat: replace 10000 overfetch with search_ids + page-only highlights

Use search_ids() for the full set of matching IDs (lightweight ints,
no arbitrary cap) and highlight_hits() for just the displayed page.
TantivyRelevanceList now holds ordered IDs for count/selection_data
and a small page of rich SearchHit dicts for serialization.

Removes the hardcoded 10000 limit that silently truncated results
for large collections. Memory usage down ~10% on sorted/paginated
search paths at 200 docs, with larger gains expected at scale.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton Holmes
2026-04-05 12:54:47 -07:00
parent 7c50e0077c
commit 610ba27891
4 changed files with 217 additions and 77 deletions
+2
View File
@@ -1,3 +1,4 @@
from documents.search._backend import SearchHit
from documents.search._backend import SearchIndexLockError
from documents.search._backend import SearchMode
from documents.search._backend import SearchResults
@@ -10,6 +11,7 @@ from documents.search._schema import needs_rebuild
from documents.search._schema import wipe_index
__all__ = [
"SearchHit",
"SearchIndexLockError",
"SearchMode",
"SearchResults",
+166 -32
View File
@@ -106,27 +106,51 @@ class SearchResults:
class TantivyRelevanceList:
"""
DRF-compatible list wrapper for Tantivy search hits.
DRF-compatible list wrapper for Tantivy search results.
Provides paginated access to search results while storing all hits in memory
for efficient ID retrieval. Used by Django REST framework for pagination.
Holds a lightweight ordered list of IDs (for pagination count and
``selection_data``) together with a small page of rich ``SearchHit``
dicts (for serialization). DRF's ``PageNumberPagination`` calls
``__len__`` to compute the total page count and ``__getitem__`` to
slice the displayed page.
Methods:
__len__: Returns total hit count for pagination calculations
__getitem__: Slices the hit list for page-specific results
Note: Stores ALL post-filter hits so get_all_result_ids() can return
every matching document ID without requiring a second search query.
Args:
ordered_ids: All matching document IDs in display order.
page_hits: Rich SearchHit dicts for the requested DRF page only.
page_offset: Index into *ordered_ids* where *page_hits* starts.
"""
def __init__(self, hits: list[SearchHit]) -> None:
self._hits = hits
def __init__(
self,
ordered_ids: list[int],
page_hits: list[SearchHit],
page_offset: int = 0,
) -> None:
self._ordered_ids = ordered_ids
self._page_hits = page_hits
self._page_offset = page_offset
def __len__(self) -> int:
return len(self._hits)
return len(self._ordered_ids)
def __getitem__(self, key: slice) -> list[SearchHit]:
return self._hits[key]
start = key.start or 0
stop = key.stop or len(self._ordered_ids)
# DRF slices to extract the current page. If the slice aligns
# with our pre-fetched page_hits, return them directly.
if start == self._page_offset and stop <= self._page_offset + len(
self._page_hits,
):
return self._page_hits[: stop - start]
# Fallback: return stub dicts (no highlights).
return [
SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
for i, doc_id in enumerate(self._ordered_ids[key])
]
def get_all_ids(self) -> list[int]:
"""Return all matching document IDs in display order."""
return self._ordered_ids
class SearchIndexLockError(Exception):
@@ -613,13 +637,111 @@ class TantivyBackend:
query=query,
)
def highlight_hits(
self,
query: str,
doc_ids: list[int],
*,
search_mode: SearchMode = SearchMode.QUERY,
) -> list[SearchHit]:
"""
Generate SearchHit dicts with highlights for specific document IDs.
Unlike search(), this does not execute a ranked query — it looks up
each document by ID and generates snippets against the provided query.
Use this when you already know which documents to display (from
search_ids + ORM filtering) and just need highlight data.
Args:
query: The search query (used for snippet generation)
doc_ids: Ordered list of document IDs to generate hits for
search_mode: Query parsing mode (for building the snippet query)
Returns:
List of SearchHit dicts in the same order as doc_ids
"""
if not doc_ids:
return []
self._ensure_open()
tz = get_current_timezone()
if search_mode is SearchMode.TEXT:
user_query = parse_simple_text_query(self._index, query)
elif search_mode is SearchMode.TITLE:
user_query = parse_simple_title_query(self._index, query)
else:
user_query = parse_user_query(self._index, query, tz)
searcher = self._index.searcher()
snippet_generator = None
hits: list[SearchHit] = []
for rank, doc_id in enumerate(doc_ids, start=1):
# Look up document by ID
id_query = tantivy.Query.range_query(
self._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
)
results = searcher.search(id_query, limit=1)
if not results.hits:
continue
doc_address = results.hits[0][1]
actual_doc = searcher.doc(doc_address)
doc_dict = actual_doc.to_dict()
highlights: dict[str, str] = {}
try:
if snippet_generator is None:
snippet_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
self._schema,
"content",
)
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
if content_snippet:
highlights["content"] = str(content_snippet)
if "notes" in doc_dict:
notes_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
self._schema,
"notes",
)
notes_snippet = notes_generator.snippet_from_doc(actual_doc)
if notes_snippet:
highlights["notes"] = str(notes_snippet)
except Exception: # pragma: no cover
logger.debug("Failed to generate highlights for doc %s", doc_id)
hits.append(
SearchHit(
id=doc_id,
score=0.0,
rank=rank,
highlights=highlights,
),
)
return hits
def search_ids(
self,
query: str,
user: AbstractBaseUser | None,
*,
sort_field: str | None = None,
sort_reverse: bool = False,
search_mode: SearchMode = SearchMode.QUERY,
limit: int = 10000,
limit: int | None = None,
) -> list[int]:
"""
Return document IDs matching a query — no highlights, no stored doc fetches.
@@ -631,11 +753,13 @@ class TantivyBackend:
Args:
query: User's search query
user: User for permission filtering (None for superuser/no filtering)
sort_field: Field to sort by (None for relevance ranking)
sort_reverse: Whether to reverse the sort order
search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
limit: Maximum number of IDs to return
limit: Maximum number of IDs to return (None = all matching docs)
Returns:
List of document IDs in relevance order
List of document IDs in the requested order
"""
self._ensure_open()
tz = get_current_timezone()
@@ -658,22 +782,31 @@ class TantivyBackend:
final_query = user_query
searcher = self._index.searcher()
results = searcher.search(final_query, limit=limit)
effective_limit = limit if limit is not None else searcher.num_docs
all_hits = [(hit[1], hit[0]) for hit in results.hits]
if sort_field and sort_field in self.SORT_FIELD_MAP:
mapped_field = self.SORT_FIELD_MAP[sort_field]
results = searcher.search(
final_query,
limit=effective_limit,
order_by_field=mapped_field,
order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
)
all_hits = [(hit[1],) for hit in results.hits]
else:
results = searcher.search(final_query, limit=effective_limit)
all_hits = [(hit[1], hit[0]) for hit in results.hits]
# Normalize scores and apply threshold (same logic as search())
if all_hits:
max_score = max(hit[1] for hit in all_hits) or 1.0
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
# Normalize scores and apply threshold (relevance search only)
if all_hits:
max_score = max(hit[1] for hit in all_hits) or 1.0
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
if threshold is not None:
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
if threshold is not None:
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
return [
searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, _score in all_hits
]
return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
def autocomplete(
self,
@@ -708,7 +841,7 @@ class TantivyBackend:
else:
base_query = tantivy.Query.all_query()
results = searcher.search(base_query, limit=10000)
results = searcher.search(base_query, limit=searcher.num_docs)
# Count how many visible documents each word appears in.
# Using Counter (not set) preserves per-word document frequency so
@@ -843,7 +976,7 @@ class TantivyBackend:
doc_id: int,
user: AbstractBaseUser | None,
*,
limit: int = 10000,
limit: int | None = None,
) -> list[int]:
"""
Return IDs of documents similar to the given document — no highlights.
@@ -854,7 +987,7 @@ class TantivyBackend:
Args:
doc_id: Primary key of the reference document
user: User for permission filtering (None for no filtering)
limit: Maximum number of IDs to return
limit: Maximum number of IDs to return (None = all matching docs)
Returns:
List of similar document IDs (excluding the original)
@@ -897,7 +1030,8 @@ class TantivyBackend:
else:
final_query = mlt_query
results = searcher.search(final_query, limit=limit)
effective_limit = limit if limit is not None else searcher.num_docs
results = searcher.search(final_query, limit=effective_limit)
ids = []
for _score, doc_address in results.hits:
+48 -44
View File
@@ -2058,6 +2058,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
if not self._is_search_request():
return super().list(request)
from documents.search import SearchHit
from documents.search import SearchMode
from documents.search import TantivyBackend
from documents.search import TantivyRelevanceList
@@ -2116,45 +2117,41 @@ class UnifiedSearchViewSet(DocumentViewSet):
search_mode = SearchMode.QUERY
query_str = request.query_params["query"]
# Step 1: Get all matching IDs (lightweight, no highlights)
all_ids = backend.search_ids(
query_str,
user=user,
sort_field=sort_field_name if use_tantivy_sort else None,
sort_reverse=sort_reverse,
search_mode=search_mode,
)
# Step 2: Intersect with ORM-visible IDs (field filters)
orm_ids = set(filtered_qs.values_list("pk", flat=True))
if use_tantivy_sort:
# Fast path: Tantivy sorts, highlights only for DRF page
results = backend.search(
query_str,
user=user,
page=1,
page_size=10000,
sort_field=sort_field_name,
sort_reverse=sort_reverse,
search_mode=search_mode,
highlight_page=requested_page,
highlight_page_size=requested_page_size,
# Fast path: Tantivy already ordered the IDs
ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]
else:
# Slow path: ORM must re-sort
id_set = set(all_ids) & orm_ids
ordered_ids = list(
filtered_qs.filter(id__in=id_set).values_list(
"pk",
flat=True,
),
)
# Intersect with ORM-visible IDs (field filters)
orm_ids = set(filtered_qs.values_list("pk", flat=True))
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
else:
# Slow path: custom field ordering — ORM must sort
results = backend.search(
query_str,
user=user,
page=1,
page_size=10000,
sort_field=None,
sort_reverse=False,
search_mode=search_mode,
highlight_page=requested_page,
highlight_page_size=requested_page_size,
)
hits_by_id = {h["id"]: h for h in results.hits}
hit_ids = set(hits_by_id.keys())
orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
"pk",
flat=True,
)
ordered_hits = [
hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id
]
# Step 3: Fetch highlights for the displayed page only
page_offset = (requested_page - 1) * requested_page_size
page_ids = ordered_ids[page_offset : page_offset + requested_page_size]
page_hits = backend.highlight_hits(
query_str,
page_ids,
search_mode=search_mode,
)
else:
# more_like_id path
try:
@@ -2172,16 +2169,24 @@ class UnifiedSearchViewSet(DocumentViewSet):
):
raise PermissionDenied(_("Insufficient permissions."))
results = backend.more_like_this(
# Step 1: Get all matching IDs (lightweight)
all_ids = backend.more_like_this_ids(
more_like_doc_id,
user=user,
page=1,
page_size=10000,
)
orm_ids = set(filtered_qs.values_list("pk", flat=True))
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]
rl = TantivyRelevanceList(ordered_hits)
# Step 2: Build hit dicts for the displayed page
# MLT has no text query, so no highlights needed
page_offset = (requested_page - 1) * requested_page_size
page_ids = ordered_ids[page_offset : page_offset + requested_page_size]
page_hits = [
SearchHit(id=doc_id, score=0.0, rank=rank, highlights={})
for rank, doc_id in enumerate(page_ids, start=page_offset + 1)
]
rl = TantivyRelevanceList(ordered_ids, page_hits, page_offset)
page = self.paginate_queryset(rl)
if page is not None:
@@ -2191,15 +2196,14 @@ class UnifiedSearchViewSet(DocumentViewSet):
if get_boolean(
str(request.query_params.get("include_selection_data", "false")),
):
all_ids = [h["id"] for h in ordered_hits]
response.data["selection_data"] = (
self._get_selection_data_for_queryset(
filtered_qs.filter(pk__in=all_ids),
filtered_qs.filter(pk__in=ordered_ids),
)
)
return response
serializer = self.get_serializer(ordered_hits, many=True)
serializer = self.get_serializer(page_hits, many=True)
return Response(serializer.data)
except NotFound:
+1 -1
View File
@@ -89,7 +89,7 @@ class StandardPagination(PageNumberPagination):
query = self.page.paginator.object_list
if isinstance(query, TantivyRelevanceList):
return [h["id"] for h in query._hits]
return query.get_all_ids()
return self.page.paginator.object_list.values_list("pk", flat=True)
def get_paginated_response_schema(self, schema):