From 610ba27891bf00c41b9351cdd13629c10efd978e Mon Sep 17 00:00:00 2001
From: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Date: Sun, 5 Apr 2026 12:54:47 -0700
Subject: [PATCH] feat: replace 10000 overfetch with search_ids + page-only highlights

Use search_ids() for the full set of matching IDs (lightweight ints, no
arbitrary cap) and highlight_hits() for just the displayed page.
TantivyRelevanceList now holds ordered IDs for count/selection_data and
a small page of rich SearchHit dicts for serialization.

Removes the hardcoded 10000 limit that silently truncated results for
large collections. Memory usage down ~10% on sorted/paginated search
paths at 200 docs, with larger gains expected at scale.

Co-Authored-By: Claude Opus 4.6
---
 src/documents/search/__init__.py |   2 +
 src/documents/search/_backend.py | 198 ++++++++++++++++++++++++++-----
 src/documents/views.py           |  92 +++++++-------
 src/paperless/views.py           |   2 +-
 4 files changed, 217 insertions(+), 77 deletions(-)

diff --git a/src/documents/search/__init__.py b/src/documents/search/__init__.py
index a4145d7ef..1b8684166 100644
--- a/src/documents/search/__init__.py
+++ b/src/documents/search/__init__.py
@@ -1,3 +1,4 @@
+from documents.search._backend import SearchHit
 from documents.search._backend import SearchIndexLockError
 from documents.search._backend import SearchMode
 from documents.search._backend import SearchResults
@@ -10,6 +11,7 @@ from documents.search._schema import needs_rebuild
 from documents.search._schema import wipe_index
 
 __all__ = [
+    "SearchHit",
     "SearchIndexLockError",
     "SearchMode",
     "SearchResults",
diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py
index 6f532f37e..eb7e51391 100644
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -106,27 +106,51 @@ class SearchResults:
 
 class TantivyRelevanceList:
     """
-    DRF-compatible list wrapper for Tantivy search hits.
+    DRF-compatible list wrapper for Tantivy search results.
 
-    Provides paginated access to search results while storing all hits in memory
-    for efficient ID retrieval. Used by Django REST framework for pagination.
+    Holds a lightweight ordered list of IDs (for pagination count and
+    ``selection_data``) together with a small page of rich ``SearchHit``
+    dicts (for serialization). DRF's ``PageNumberPagination`` calls
+    ``__len__`` to compute the total page count and ``__getitem__`` to
+    slice the displayed page.
 
-    Methods:
-        __len__: Returns total hit count for pagination calculations
-        __getitem__: Slices the hit list for page-specific results
-
-    Note: Stores ALL post-filter hits so get_all_result_ids() can return
-    every matching document ID without requiring a second search query.
+    Args:
+        ordered_ids: All matching document IDs in display order.
+        page_hits: Rich SearchHit dicts for the requested DRF page only.
+        page_offset: Index into *ordered_ids* where *page_hits* starts.
     """
 
-    def __init__(self, hits: list[SearchHit]) -> None:
-        self._hits = hits
+    def __init__(
+        self,
+        ordered_ids: list[int],
+        page_hits: list[SearchHit],
+        page_offset: int = 0,
+    ) -> None:
+        self._ordered_ids = ordered_ids
+        self._page_hits = page_hits
+        self._page_offset = page_offset
 
     def __len__(self) -> int:
-        return len(self._hits)
+        return len(self._ordered_ids)
 
     def __getitem__(self, key: slice) -> list[SearchHit]:
-        return self._hits[key]
+        start = key.start or 0
+        stop = key.stop or len(self._ordered_ids)
+        # DRF slices to extract the current page. If the slice aligns
+        # with our pre-fetched page_hits, return them directly.
+        if start == self._page_offset and stop <= self._page_offset + len(
+            self._page_hits,
+        ):
+            return self._page_hits[: stop - start]
+        # Fallback: return stub dicts (no highlights).
+        return [
+            SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
+            for i, doc_id in enumerate(self._ordered_ids[key])
+        ]
+
+    def get_all_ids(self) -> list[int]:
+        """Return all matching document IDs in display order."""
+        return self._ordered_ids
 
 
 class SearchIndexLockError(Exception):
@@ -613,13 +637,111 @@ class TantivyBackend:
             query=query,
         )
 
+    def highlight_hits(
+        self,
+        query: str,
+        doc_ids: list[int],
+        *,
+        search_mode: SearchMode = SearchMode.QUERY,
+    ) -> list[SearchHit]:
+        """
+        Generate SearchHit dicts with highlights for specific document IDs.
+
+        Unlike search(), this does not execute a ranked query — it looks up
+        each document by ID and generates snippets against the provided query.
+        Use this when you already know which documents to display (from
+        search_ids + ORM filtering) and just need highlight data.
+
+        Args:
+            query: The search query (used for snippet generation)
+            doc_ids: Ordered list of document IDs to generate hits for
+            search_mode: Query parsing mode (for building the snippet query)
+
+        Returns:
+            List of SearchHit dicts in the same order as doc_ids
+        """
+        if not doc_ids:
+            return []
+
+        self._ensure_open()
+        tz = get_current_timezone()
+        if search_mode is SearchMode.TEXT:
+            user_query = parse_simple_text_query(self._index, query)
+        elif search_mode is SearchMode.TITLE:
+            user_query = parse_simple_title_query(self._index, query)
+        else:
+            user_query = parse_user_query(self._index, query, tz)
+
+        searcher = self._index.searcher()
+        snippet_generator = None
+        hits: list[SearchHit] = []
+
+        for rank, doc_id in enumerate(doc_ids, start=1):
+            # Look up document by ID
+            id_query = tantivy.Query.range_query(
+                self._schema,
+                "id",
+                tantivy.FieldType.Unsigned,
+                doc_id,
+                doc_id,
+            )
+            results = searcher.search(id_query, limit=1)
+
+            if not results.hits:
+                continue
+
+            doc_address = results.hits[0][1]
+            actual_doc = searcher.doc(doc_address)
+            doc_dict = actual_doc.to_dict()
+
+            highlights: dict[str, str] = {}
+            try:
+                if snippet_generator is None:
+                    snippet_generator = tantivy.SnippetGenerator.create(
+                        searcher,
+                        user_query,
+                        self._schema,
+                        "content",
+                    )
+
+                content_snippet = snippet_generator.snippet_from_doc(actual_doc)
+                if content_snippet:
+                    highlights["content"] = str(content_snippet)
+
+                if "notes" in doc_dict:
+                    notes_generator = tantivy.SnippetGenerator.create(
+                        searcher,
+                        user_query,
+                        self._schema,
+                        "notes",
+                    )
+                    notes_snippet = notes_generator.snippet_from_doc(actual_doc)
+                    if notes_snippet:
+                        highlights["notes"] = str(notes_snippet)
+
+            except Exception:  # pragma: no cover
+                logger.debug("Failed to generate highlights for doc %s", doc_id)
+
+            hits.append(
+                SearchHit(
+                    id=doc_id,
+                    score=0.0,
+                    rank=rank,
+                    highlights=highlights,
+                ),
+            )
+
+        return hits
+
     def search_ids(
         self,
         query: str,
         user: AbstractBaseUser | None,
         *,
+        sort_field: str | None = None,
+        sort_reverse: bool = False,
         search_mode: SearchMode = SearchMode.QUERY,
-        limit: int = 10000,
+        limit: int | None = None,
     ) -> list[int]:
         """
         Return document IDs matching a query — no highlights, no stored doc fetches.
@@ -631,11 +753,13 @@ class TantivyBackend:
         Args:
             query: User's search query
             user: User for permission filtering (None for superuser/no filtering)
+            sort_field: Field to sort by (None for relevance ranking)
+            sort_reverse: Whether to reverse the sort order
             search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
-            limit: Maximum number of IDs to return
+            limit: Maximum number of IDs to return (None = all matching docs)
 
         Returns:
-            List of document IDs in relevance order
+            List of document IDs in the requested order
         """
         self._ensure_open()
         tz = get_current_timezone()
@@ -658,22 +782,31 @@ class TantivyBackend:
             final_query = user_query
 
         searcher = self._index.searcher()
-        results = searcher.search(final_query, limit=limit)
+        effective_limit = limit if limit is not None else searcher.num_docs
 
-        all_hits = [(hit[1], hit[0]) for hit in results.hits]
+        if sort_field and sort_field in self.SORT_FIELD_MAP:
+            mapped_field = self.SORT_FIELD_MAP[sort_field]
+            results = searcher.search(
+                final_query,
+                limit=effective_limit,
+                order_by_field=mapped_field,
+                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
+            )
+            all_hits = [(hit[1],) for hit in results.hits]
+        else:
+            results = searcher.search(final_query, limit=effective_limit)
+            all_hits = [(hit[1], hit[0]) for hit in results.hits]
 
-        # Normalize scores and apply threshold (same logic as search())
-        if all_hits:
-            max_score = max(hit[1] for hit in all_hits) or 1.0
-            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
+            # Normalize scores and apply threshold (relevance search only)
+            if all_hits:
+                max_score = max(hit[1] for hit in all_hits) or 1.0
+                all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
 
-        threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
-        if threshold is not None:
-            all_hits = [hit for hit in all_hits if hit[1] >= threshold]
+            threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
+            if threshold is not None:
+                all_hits = [hit for hit in all_hits if hit[1] >= threshold]
 
-        return [
-            searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, _score in all_hits
-        ]
+        return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
 
     def autocomplete(
         self,
@@ -708,7 +841,7 @@ class TantivyBackend:
         else:
            base_query = tantivy.Query.all_query()
 
-        results = searcher.search(base_query, limit=10000)
+        results = searcher.search(base_query, limit=searcher.num_docs)
 
         # Count how many visible documents each word appears in.
         # Using Counter (not set) preserves per-word document frequency so
@@ -843,7 +976,7 @@ class TantivyBackend:
         doc_id: int,
         user: AbstractBaseUser | None,
         *,
-        limit: int = 10000,
+        limit: int | None = None,
     ) -> list[int]:
         """
         Return IDs of documents similar to the given document — no highlights.
@@ -854,7 +987,7 @@ class TantivyBackend:
         Args:
             doc_id: Primary key of the reference document
             user: User for permission filtering (None for no filtering)
-            limit: Maximum number of IDs to return
+            limit: Maximum number of IDs to return (None = all matching docs)
 
         Returns:
             List of similar document IDs (excluding the original)
@@ -897,7 +1030,8 @@ class TantivyBackend:
         else:
             final_query = mlt_query
 
-        results = searcher.search(final_query, limit=limit)
+        effective_limit = limit if limit is not None else searcher.num_docs
+        results = searcher.search(final_query, limit=effective_limit)
 
         ids = []
         for _score, doc_address in results.hits:
diff --git a/src/documents/views.py b/src/documents/views.py
index b383f101b..a3a7c9138 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -2058,6 +2058,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
         if not self._is_search_request():
             return super().list(request)
 
+        from documents.search import SearchHit
         from documents.search import SearchMode
         from documents.search import TantivyBackend
         from documents.search import TantivyRelevanceList
@@ -2116,45 +2117,41 @@ class UnifiedSearchViewSet(DocumentViewSet):
                search_mode = SearchMode.QUERY
                 query_str = request.query_params["query"]
 
+                # Step 1: Get all matching IDs (lightweight, no highlights)
+                all_ids = backend.search_ids(
+                    query_str,
+                    user=user,
+                    sort_field=sort_field_name if use_tantivy_sort else None,
+                    sort_reverse=sort_reverse,
+                    search_mode=search_mode,
+                )
+
+                # Step 2: Intersect with ORM-visible IDs (field filters)
+                orm_ids = set(filtered_qs.values_list("pk", flat=True))
+
                 if use_tantivy_sort:
-                    # Fast path: Tantivy sorts, highlights only for DRF page
-                    results = backend.search(
-                        query_str,
-                        user=user,
-                        page=1,
-                        page_size=10000,
-                        sort_field=sort_field_name,
-                        sort_reverse=sort_reverse,
-                        search_mode=search_mode,
-                        highlight_page=requested_page,
-                        highlight_page_size=requested_page_size,
+                    # Fast path: Tantivy already ordered the IDs
+                    ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]
+                else:
+                    # Slow path: ORM must re-sort
+                    id_set = set(all_ids) & orm_ids
+                    ordered_ids = list(
+                        filtered_qs.filter(id__in=id_set).values_list(
+                            "pk",
+                            flat=True,
+                        ),
                     )
 
-                    # Intersect with ORM-visible IDs (field filters)
-                    orm_ids = set(filtered_qs.values_list("pk", flat=True))
-                    ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
-                else:
-                    # Slow path: custom field ordering — ORM must sort
-                    results = backend.search(
-                        query_str,
-                        user=user,
-                        page=1,
-                        page_size=10000,
-                        sort_field=None,
-                        sort_reverse=False,
-                        search_mode=search_mode,
-                        highlight_page=requested_page,
-                        highlight_page_size=requested_page_size,
-                    )
-                    hits_by_id = {h["id"]: h for h in results.hits}
-                    hit_ids = set(hits_by_id.keys())
-                    orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
-                        "pk",
-                        flat=True,
-                    )
-                    ordered_hits = [
-                        hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id
-                    ]
+                # Step 3: Fetch highlights for the displayed page only
+                page_offset = (requested_page - 1) * requested_page_size
+                page_ids = ordered_ids[page_offset : page_offset + requested_page_size]
+
+                page_hits = backend.highlight_hits(
+                    query_str,
+                    page_ids,
+                    search_mode=search_mode,
+                )
+
             else:  # more_like_id path
                 try:
@@ -2172,16 +2169,24 @@ class UnifiedSearchViewSet(DocumentViewSet):
                 ):
                     raise PermissionDenied(_("Insufficient permissions."))
 
-                results = backend.more_like_this(
+                # Step 1: Get all matching IDs (lightweight)
+                all_ids = backend.more_like_this_ids(
                     more_like_doc_id,
                     user=user,
-                    page=1,
-                    page_size=10000,
                 )
 
                 orm_ids = set(filtered_qs.values_list("pk", flat=True))
-                ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
+                ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]
 
-            rl = TantivyRelevanceList(ordered_hits)
+                # Step 2: Build hit dicts for the displayed page
+                # MLT has no text query, so no highlights needed
+                page_offset = (requested_page - 1) * requested_page_size
+                page_ids = ordered_ids[page_offset : page_offset + requested_page_size]
+                page_hits = [
+                    SearchHit(id=doc_id, score=0.0, rank=rank, highlights={})
+                    for rank, doc_id in enumerate(page_ids, start=page_offset + 1)
+                ]
+
+            rl = TantivyRelevanceList(ordered_ids, page_hits, page_offset)
             page = self.paginate_queryset(rl)
             if page is not None:
@@ -2191,15 +2196,14 @@ class UnifiedSearchViewSet(DocumentViewSet):
                 if get_boolean(
                     str(request.query_params.get("include_selection_data", "false")),
                 ):
-                    all_ids = [h["id"] for h in ordered_hits]
                     response.data["selection_data"] = (
                         self._get_selection_data_for_queryset(
-                            filtered_qs.filter(pk__in=all_ids),
+                            filtered_qs.filter(pk__in=ordered_ids),
                         )
                     )
                 return response
 
-            serializer = self.get_serializer(ordered_hits, many=True)
+            serializer = self.get_serializer(page_hits, many=True)
             return Response(serializer.data)
 
         except NotFound:
diff --git a/src/paperless/views.py b/src/paperless/views.py
index e4db40bb4..2192ed7b6 100644
--- a/src/paperless/views.py
+++ b/src/paperless/views.py
@@ -89,7 +89,7 @@ class StandardPagination(PageNumberPagination):
         query = self.page.paginator.object_list
         if isinstance(query, TantivyRelevanceList):
-            return [h["id"] for h in query._hits]
+            return query.get_all_ids()
 
         return self.page.paginator.object_list.values_list("pk", flat=True)
 
     def get_paginated_response_schema(self, schema):
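
Illustrative note (not part of the patch): a minimal sketch of how the view layer composes the new backend pieces, assuming the objects the view already has in scope (backend, filtered_qs, user, query_str, search_mode, requested_page, requested_page_size). The helper name run_search_page is hypothetical and only exists for this sketch.

    from documents.search import TantivyRelevanceList

    def run_search_page(backend, filtered_qs, query_str, user, search_mode,
                        requested_page, requested_page_size):
        # Step 1: every matching ID in display order, no highlight work yet.
        all_ids = backend.search_ids(query_str, user=user, search_mode=search_mode)

        # Step 2: drop IDs that the ORM field filters exclude.
        orm_ids = set(filtered_qs.values_list("pk", flat=True))
        ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]

        # Step 3: fetch rich SearchHit dicts only for the page being shown.
        page_offset = (requested_page - 1) * requested_page_size
        page_ids = ordered_ids[page_offset:page_offset + requested_page_size]
        page_hits = backend.highlight_hits(query_str, page_ids, search_mode=search_mode)

        # DRF paginates the wrapper: __len__ reports the full result count,
        # __getitem__ hands back the pre-fetched page_hits when the slice lines up.
        return TantivyRelevanceList(ordered_ids, page_hits, page_offset)

The point of the split is visible in the three steps: the full ID list stays cheap (plain ints), and the snippet generation cost scales with the page size rather than with the size of the result set.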