Feature: paginate search highlights and remove 10k document search limit (#12518)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2026-05-23 06:55:23 +00:00 · 2026-04-15 16:20:31 -07:00
parent 21db608d57
commit 3ffbb8862c
12 changed files with 904 additions and 664 deletions
@@ -1,9 +1,9 @@
 from __future__ import annotations

 import logging
+import re
 import threading
 from collections import Counter
-from dataclasses import dataclass
 from datetime import UTC
 from datetime import datetime
 from enum import StrEnum
@@ -88,45 +88,63 @@ class SearchHit(TypedDict):
    highlights: dict[str, str]


-@dataclass(frozen=True, slots=True)
-class SearchResults:
-    """
-    Container for search results with pagination metadata.
-
-    Attributes:
-        hits: List of search results with scores and highlights
-        total: Total matching documents across all pages (for pagination)
-        query: Preprocessed query string after date/syntax rewriting
-    """
-
-    hits: list[SearchHit]
-    total: int  # total matching documents (for pagination)
-    query: str  # preprocessed query string
-
-
 class TantivyRelevanceList:
    """
-    DRF-compatible list wrapper for Tantivy search hits.
+    DRF-compatible list wrapper for Tantivy search results.

-    Provides paginated access to search results while storing all hits in memory
-    for efficient ID retrieval. Used by Django REST framework for pagination.
+    Holds a lightweight ordered list of IDs (for pagination count and
+    ``selection_data``) together with a small page of rich ``SearchHit``
+    dicts (for serialization).  DRF's ``PageNumberPagination`` calls
+    ``__len__`` to compute the total page count and ``__getitem__`` to
+    slice the displayed page.

-    Methods:
-        __len__: Returns total hit count for pagination calculations
-        __getitem__: Slices the hit list for page-specific results
-
-    Note: Stores ALL post-filter hits so get_all_result_ids() can return
-    every matching document ID without requiring a second search query.
+    Args:
+        ordered_ids: All matching document IDs in display order.
+        page_hits: Rich SearchHit dicts for the requested DRF page only.
+        page_offset: Index into *ordered_ids* where *page_hits* starts.
    """

-    def __init__(self, hits: list[SearchHit]) -> None:
-        self._hits = hits
+    def __init__(
+        self,
+        ordered_ids: list[int],
+        page_hits: list[SearchHit],
+        page_offset: int = 0,
+    ) -> None:
+        self._ordered_ids = ordered_ids
+        self._page_hits = page_hits
+        self._page_offset = page_offset

    def __len__(self) -> int:
-        return len(self._hits)
+        return len(self._ordered_ids)

-    def __getitem__(self, key: slice) -> list[SearchHit]:
-        return self._hits[key]
+    def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]:
+        if isinstance(key, int):
+            idx = key if key >= 0 else len(self._ordered_ids) + key
+            if self._page_offset <= idx < self._page_offset + len(self._page_hits):
+                return self._page_hits[idx - self._page_offset]
+            return SearchHit(
+                id=self._ordered_ids[key],
+                score=0.0,
+                rank=idx + 1,
+                highlights={},
+            )
+        start = key.start or 0
+        stop = key.stop or len(self._ordered_ids)
+        # DRF slices to extract the current page.  If the slice aligns
+        # with our pre-fetched page_hits, return them directly.
+        # We only check start — DRF always slices with stop=start+page_size,
+        # which exceeds page_hits length on the last page.
+        if start == self._page_offset:
+            return self._page_hits[: stop - start]
+        # Fallback: return stub dicts (no highlights).
+        return [
+            SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
+            for i, doc_id in enumerate(self._ordered_ids[key])
+        ]
+
+    def get_all_ids(self) -> list[int]:
+        """Return all matching document IDs in display order."""
+        return self._ordered_ids


 class SearchIndexLockError(Exception):
@@ -206,10 +224,13 @@ class WriteBatch:
        """
        Remove a document from the batch by its primary key.

-        Uses range query instead of term query to work around unsigned integer
-        type detection bug in tantivy-py 0.25.
+        Uses range_query instead of term_query to work around a tantivy-py bug
+        where Python integers are inferred as i64, producing Terms that never
+        match u64 fields.
+
+        TODO: Replace with term_query("id", doc_id) once
+        https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
        """
-        # Use range query to work around u64 deletion bug
        self._writer.delete_documents_by_query(
            tantivy.Query.range_query(
                self._backend._schema,
@@ -234,6 +255,34 @@ class TantivyBackend:
    the underlying index directory changes (e.g., during test isolation).
    """

+    # Maps DRF ordering field names to Tantivy index field names.
+    SORT_FIELD_MAP: dict[str, str] = {
+        "title": "title_sort",
+        "correspondent__name": "correspondent_sort",
+        "document_type__name": "type_sort",
+        "created": "created",
+        "added": "added",
+        "modified": "modified",
+        "archive_serial_number": "asn",
+        "page_count": "page_count",
+        "num_notes": "num_notes",
+    }
+
+    # Fields where Tantivy's sort order matches the ORM's sort order.
+    # Text-based fields (title, correspondent__name, document_type__name)
+    # are excluded because Tantivy's tokenized fast fields produce different
+    # ordering than the ORM's collation-based ordering.
+    SORTABLE_FIELDS: frozenset[str] = frozenset(
+        {
+            "created",
+            "added",
+            "modified",
+            "archive_serial_number",
+            "page_count",
+            "num_notes",
+        },
+    )
+
    def __init__(self, path: Path | None = None):
        # path=None → in-memory index (for tests)
        # path=some_dir → on-disk index (for production)
@@ -272,6 +321,36 @@ class TantivyBackend:
        if self._index is None:
            self.open()  # pragma: no cover

+    def _parse_query(
+        self,
+        query: str,
+        search_mode: SearchMode,
+    ) -> tantivy.Query:
+        """Parse a user query string into a Tantivy Query object."""
+        tz = get_current_timezone()
+        if search_mode is SearchMode.TEXT:
+            return parse_simple_text_query(self._index, query)
+        elif search_mode is SearchMode.TITLE:
+            return parse_simple_title_query(self._index, query)
+        else:
+            return parse_user_query(self._index, query, tz)
+
+    def _apply_permission_filter(
+        self,
+        query: tantivy.Query,
+        user: AbstractBaseUser | None,
+    ) -> tantivy.Query:
+        """Wrap a query with a permission filter if the user is not a superuser."""
+        if user is not None:
+            permission_filter = build_permission_filter(self._schema, user)
+            return tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, query),
+                    (tantivy.Occur.Must, permission_filter),
+                ],
+            )
+        return query
+
    def _build_tantivy_doc(
        self,
        document: Document,
@@ -326,12 +405,17 @@ class TantivyBackend:
            doc.add_unsigned("tag_id", tag.pk)
            tag_names.append(tag.name)

-        # Notes — JSON for structured queries (notes.user:alice, notes.note:text),
-        # companion text field for default full-text search.
+        # Notes — JSON for structured queries (notes.user:alice, notes.note:text).
+        # notes_text is a plain-text companion for snippet/highlight generation;
+        # tantivy's SnippetGenerator does not support JSON fields.
        num_notes = 0
+        note_texts: list[str] = []
        for note in document.notes.all():
            num_notes += 1
            doc.add_json("notes", {"note": note.note, "user": note.user.username})
+            note_texts.append(note.note)
+        if note_texts:
+            doc.add_text("notes_text", " ".join(note_texts))

        # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
        # companion text field for default full-text search.
@@ -425,155 +509,125 @@ class TantivyBackend:
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.remove(doc_id)

-    def search(
+    def highlight_hits(
        self,
        query: str,
-        user: AbstractBaseUser | None,
-        page: int,
-        page_size: int,
-        sort_field: str | None,
+        doc_ids: list[int],
        *,
-        sort_reverse: bool,
        search_mode: SearchMode = SearchMode.QUERY,
-    ) -> SearchResults:
+        rank_start: int = 1,
+    ) -> list[SearchHit]:
        """
-        Execute a search query against the document index.
+        Generate SearchHit dicts with highlights for specific document IDs.

-        Processes the user query through date rewriting, normalization, and
-        permission filtering before executing against Tantivy. Supports both
-        relevance-based and field-based sorting.
+        Unlike search(), this does not execute a ranked query — it looks up
+        each document by ID and generates snippets against the provided query.
+        Use this when you already know which documents to display (from
+        search_ids + ORM filtering) and just need highlight data.

-        QUERY search mode supports natural date keywords, field filters, etc.
-        TITLE search mode treats the query as plain text to search for in title only
-        TEXT search mode treats the query as plain text to search for in title and content
+        Note: Each doc_id requires an individual index lookup because tantivy-py
+        does not yet expose a batch fast-field read API. This is acceptable for
+        page-sized batches (typically 25 docs) but should not be called with
+        thousands of IDs.
+
+        TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands,
+        the per-doc range_query lookups here can be replaced with a single
+        collect_u64_fast_field("id", doc_addresses) call.

        Args:
-            query: User's search query
-            user: User for permission filtering (None for superuser/no filtering)
-            page: Page number (1-indexed) for pagination
-            page_size: Number of results per page
-            sort_field: Field to sort by (None for relevance ranking)
-            sort_reverse: Whether to reverse the sort order
-            search_mode: "query" for advanced Tantivy syntax, "text" for
-                plain-text search over title and content only, "title" for
-                plain-text search over title only
+            query: The search query (used for snippet generation)
+            doc_ids: Ordered list of document IDs to generate hits for
+            search_mode: Query parsing mode (for building the snippet query)
+            rank_start: Starting rank value (1-based absolute position in the
+                full result set; pass ``page_offset + 1`` for paginated calls)

        Returns:
-            SearchResults with hits, total count, and processed query
+            List of SearchHit dicts in the same order as doc_ids
        """
-        self._ensure_open()
-        tz = get_current_timezone()
-        if search_mode is SearchMode.TEXT:
-            user_query = parse_simple_text_query(self._index, query)
-        elif search_mode is SearchMode.TITLE:
-            user_query = parse_simple_title_query(self._index, query)
-        else:
-            user_query = parse_user_query(self._index, query, tz)
+        if not doc_ids:
+            return []

-        # Apply permission filter if user is not None (not superuser)
-        if user is not None:
-            permission_filter = build_permission_filter(self._schema, user)
-            final_query = tantivy.Query.boolean_query(
-                [
-                    (tantivy.Occur.Must, user_query),
-                    (tantivy.Occur.Must, permission_filter),
-                ],
+        self._ensure_open()
+        user_query = self._parse_query(query, search_mode)
+
+        # For notes_text snippet generation, we need a query that targets the
+        # notes_text field directly. user_query may contain JSON-field terms
+        # (e.g. notes.note:urgent) that the SnippetGenerator cannot resolve
+        # against a text field. Strip field:value prefixes so bare terms like
+        # "urgent" are re-parsed against notes_text, producing highlights even
+        # when the original query used structured syntax.
+        bare_query = re.sub(r"\w[\w.]*:", "", query).strip()
+        try:
+            notes_text_query = (
+                self._index.parse_query(bare_query, ["notes_text"])
+                if bare_query
+                else user_query
            )
-        else:
-            final_query = user_query
+        except Exception:
+            notes_text_query = user_query

        searcher = self._index.searcher()
-        offset = (page - 1) * page_size
-
-        # Map sort fields
-        sort_field_map = {
-            "title": "title_sort",
-            "correspondent__name": "correspondent_sort",
-            "document_type__name": "type_sort",
-            "created": "created",
-            "added": "added",
-            "modified": "modified",
-            "archive_serial_number": "asn",
-            "page_count": "page_count",
-            "num_notes": "num_notes",
-        }
-
-        # Perform search
-        if sort_field and sort_field in sort_field_map:
-            mapped_field = sort_field_map[sort_field]
-            results = searcher.search(
-                final_query,
-                limit=offset + page_size,
-                order_by_field=mapped_field,
-                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
-            )
-            # Field sorting: hits are still (score, DocAddress) tuples; score unused
-            all_hits = [(hit[1], 0.0) for hit in results.hits]
-        else:
-            # Score-based search: hits are (score, DocAddress) tuples
-            results = searcher.search(final_query, limit=offset + page_size)
-            all_hits = [(hit[1], hit[0]) for hit in results.hits]
-
-        total = results.count
-
-        # Normalize scores for score-based searches
-        if not sort_field and all_hits:
-            max_score = max(hit[1] for hit in all_hits) or 1.0
-            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
-
-        # Apply threshold filter if configured (score-based search only)
-        threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
-        if threshold is not None and not sort_field:
-            all_hits = [hit for hit in all_hits if hit[1] >= threshold]
-
-        # Get the page's hits
-        page_hits = all_hits[offset : offset + page_size]
-
-        # Build result hits with highlights
-        hits: list[SearchHit] = []
        snippet_generator = None
        notes_snippet_generator = None
+        hits: list[SearchHit] = []

-        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
-            # Get the actual document from the searcher using the doc address
+        for rank, doc_id in enumerate(doc_ids, start=rank_start):
+            # Look up document by ID, scoring against the user query so that
+            # the returned SearchHit carries a real BM25 relevance score.
+            id_query = tantivy.Query.range_query(
+                self._schema,
+                "id",
+                tantivy.FieldType.Unsigned,
+                doc_id,
+                doc_id,
+            )
+            scored_query = tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, user_query),
+                    (tantivy.Occur.Must, id_query),
+                ],
+            )
+            results = searcher.search(scored_query, limit=1)
+
+            if not results.hits:
+                continue
+
+            score, doc_address = results.hits[0]
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
-            doc_id = doc_dict["id"][0]

            highlights: dict[str, str] = {}
+            try:
+                if snippet_generator is None:
+                    snippet_generator = tantivy.SnippetGenerator.create(
+                        searcher,
+                        user_query,
+                        self._schema,
+                        "content",
+                    )

-            # Generate highlights if score > 0
-            if score > 0:
-                try:
-                    if snippet_generator is None:
-                        snippet_generator = tantivy.SnippetGenerator.create(
+                content_html = snippet_generator.snippet_from_doc(actual_doc).to_html()
+                if content_html:
+                    highlights["content"] = content_html
+
+                if "notes_text" in doc_dict:
+                    # Use notes_text (plain text) for snippet generation — tantivy's
+                    # SnippetGenerator does not support JSON fields.
+                    if notes_snippet_generator is None:
+                        notes_snippet_generator = tantivy.SnippetGenerator.create(
                            searcher,
-                            final_query,
+                            notes_text_query,
                            self._schema,
-                            "content",
+                            "notes_text",
                        )
+                    notes_html = notes_snippet_generator.snippet_from_doc(
+                        actual_doc,
+                    ).to_html()
+                    if notes_html:
+                        highlights["notes"] = notes_html

-                    content_snippet = snippet_generator.snippet_from_doc(actual_doc)
-                    if content_snippet:
-                        highlights["content"] = str(content_snippet)
-
-                    # Try notes highlights
-                    if "notes" in doc_dict:
-                        if notes_snippet_generator is None:
-                            notes_snippet_generator = tantivy.SnippetGenerator.create(
-                                searcher,
-                                final_query,
-                                self._schema,
-                                "notes",
-                            )
-                        notes_snippet = notes_snippet_generator.snippet_from_doc(
-                            actual_doc,
-                        )
-                        if notes_snippet:
-                            highlights["notes"] = str(notes_snippet)
-
-                except Exception:  # pragma: no cover
-                    logger.debug("Failed to generate highlights for doc %s", doc_id)
+            except Exception:  # pragma: no cover
+                logger.debug("Failed to generate highlights for doc %s", doc_id)

            hits.append(
                SearchHit(
@@ -584,11 +638,69 @@ class TantivyBackend:
                ),
            )

-        return SearchResults(
-            hits=hits,
-            total=total,
-            query=query,
-        )
+        return hits
+
+    def search_ids(
+        self,
+        query: str,
+        user: AbstractBaseUser | None,
+        *,
+        sort_field: str | None = None,
+        sort_reverse: bool = False,
+        search_mode: SearchMode = SearchMode.QUERY,
+        limit: int | None = None,
+    ) -> list[int]:
+        """
+        Return document IDs matching a query — no highlights or scores.
+
+        This is the lightweight companion to search(). Use it when you need the
+        full set of matching IDs (e.g. for ``selection_data``) but don't need
+        scores, ranks, or highlights.
+
+        Args:
+            query: User's search query
+            user: User for permission filtering (None for superuser/no filtering)
+            sort_field: Field to sort by (None for relevance ranking)
+            sort_reverse: Whether to reverse the sort order
+            search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
+            limit: Maximum number of IDs to return (None = all matching docs)
+
+        Returns:
+            List of document IDs in the requested order
+        """
+        self._ensure_open()
+        user_query = self._parse_query(query, search_mode)
+        final_query = self._apply_permission_filter(user_query, user)
+
+        searcher = self._index.searcher()
+        effective_limit = limit if limit is not None else searcher.num_docs
+
+        if sort_field and sort_field in self.SORT_FIELD_MAP:
+            mapped_field = self.SORT_FIELD_MAP[sort_field]
+            results = searcher.search(
+                final_query,
+                limit=effective_limit,
+                order_by_field=mapped_field,
+                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
+            )
+            all_hits = [(hit[1],) for hit in results.hits]
+        else:
+            results = searcher.search(final_query, limit=effective_limit)
+            all_hits = [(hit[1], hit[0]) for hit in results.hits]
+
+            # Normalize scores and apply threshold (relevance search only)
+            if all_hits:
+                max_score = max(hit[1] for hit in all_hits) or 1.0
+                all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
+
+            threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
+            if threshold is not None:
+                all_hits = [hit for hit in all_hits if hit[1] >= threshold]
+
+        # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once
+        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates
+        # one stored-doc fetch per result (~80% reduction in search_ids latency).
+        return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]

    def autocomplete(
        self,
@@ -603,6 +715,10 @@ class TantivyBackend:
        frequency (how many documents contain each word). Optionally filters
        results to only words from documents visible to the specified user.

+        NOTE: This is the hottest search path (called per keystroke).
+        A future improvement would be to cache results in Redis, keyed by
+        (prefix, user_id), and invalidate on index writes.
+
        Args:
            term: Prefix to match against autocomplete words
            limit: Maximum number of suggestions to return
@@ -613,64 +729,94 @@ class TantivyBackend:
        """
        self._ensure_open()
        normalized_term = ascii_fold(term.lower())
+        if not normalized_term:
+            return []

        searcher = self._index.searcher()

-        # Apply permission filter for non-superusers so autocomplete words
-        # from invisible documents don't leak to other users.
+        # Build a prefix query on autocomplete_word so we only scan docs
+        # containing words that start with the prefix, not the entire index.
+        # tantivy regex is implicitly anchored; .+ avoids the empty-match
+        # error that .* triggers.  We OR with term_query to also match the
+        # exact prefix as a complete word.
+        escaped = re.escape(normalized_term)
+        prefix_query = tantivy.Query.boolean_query(
+            [
+                (
+                    tantivy.Occur.Should,
+                    tantivy.Query.term_query(
+                        self._schema,
+                        "autocomplete_word",
+                        normalized_term,
+                    ),
+                ),
+                (
+                    tantivy.Occur.Should,
+                    tantivy.Query.regex_query(
+                        self._schema,
+                        "autocomplete_word",
+                        f"{escaped}.+",
+                    ),
+                ),
+            ],
+        )
+
+        # Intersect with permission filter so autocomplete words from
+        # invisible documents don't leak to other users.
        if user is not None and not user.is_superuser:
-            base_query = build_permission_filter(self._schema, user)
+            final_query = tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, prefix_query),
+                    (tantivy.Occur.Must, build_permission_filter(self._schema, user)),
+                ],
+            )
        else:
-            base_query = tantivy.Query.all_query()
+            final_query = prefix_query

-        results = searcher.search(base_query, limit=10000)
+        results = searcher.search(final_query, limit=searcher.num_docs)

-        # Count how many visible documents each word appears in.
-        # Using Counter (not set) preserves per-word document frequency so
-        # we can rank suggestions by how commonly they occur — the same
-        # signal Whoosh used for Tf/Idf-based autocomplete ordering.
+        # Count how many visible documents each matching word appears in.
        word_counts: Counter[str] = Counter()
        for _score, doc_address in results.hits:
            stored_doc = searcher.doc(doc_address)
            doc_dict = stored_doc.to_dict()
            if "autocomplete_word" in doc_dict:
-                word_counts.update(doc_dict["autocomplete_word"])
+                for word in doc_dict["autocomplete_word"]:
+                    if word.startswith(normalized_term):
+                        word_counts[word] += 1

-        # Filter to prefix matches, sort by document frequency descending;
-        # break ties alphabetically for stable, deterministic output.
+        # Sort by document frequency descending; break ties alphabetically.
        matches = sorted(
-            (w for w in word_counts if w.startswith(normalized_term)),
+            word_counts,
            key=lambda w: (-word_counts[w], w),
        )

        return matches[:limit]

-    def more_like_this(
+    def more_like_this_ids(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
-        page: int,
-        page_size: int,
-    ) -> SearchResults:
+        *,
+        limit: int | None = None,
+    ) -> list[int]:
        """
-        Find documents similar to the given document using content analysis.
+        Return IDs of documents similar to the given document — no highlights.

-        Uses Tantivy's "more like this" query to find documents with similar
-        content patterns. The original document is excluded from results.
+        Lightweight companion to more_like_this(). The original document is
+        excluded from results.

        Args:
            doc_id: Primary key of the reference document
            user: User for permission filtering (None for no filtering)
-            page: Page number (1-indexed) for pagination
-            page_size: Number of results per page
+            limit: Maximum number of IDs to return (None = all matching docs)

        Returns:
-            SearchResults with similar documents (excluding the original)
+            List of similar document IDs (excluding the original)
        """
        self._ensure_open()
        searcher = self._index.searcher()

-        # First find the document address
        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
@@ -681,13 +827,9 @@ class TantivyBackend:
        results = searcher.search(id_query, limit=1)

        if not results.hits:
-            # Document not found
-            return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")
+            return []

-        # Extract doc_address from (score, doc_address) tuple
        doc_address = results.hits[0][1]
-
-        # Build more like this query
        mlt_query = tantivy.Query.more_like_this_query(
            doc_address,
            min_doc_frequency=1,
@@ -699,59 +841,21 @@ class TantivyBackend:
            boost_factor=None,
        )

-        # Apply permission filter
-        if user is not None:
-            permission_filter = build_permission_filter(self._schema, user)
-            final_query = tantivy.Query.boolean_query(
-                [
-                    (tantivy.Occur.Must, mlt_query),
-                    (tantivy.Occur.Must, permission_filter),
-                ],
-            )
-        else:
-            final_query = mlt_query
+        final_query = self._apply_permission_filter(mlt_query, user)

-        # Search
-        offset = (page - 1) * page_size
-        results = searcher.search(final_query, limit=offset + page_size)
+        effective_limit = limit if limit is not None else searcher.num_docs
+        # Fetch one extra to account for excluding the original document
+        results = searcher.search(final_query, limit=effective_limit + 1)

-        total = results.count
-        # Convert from (score, doc_address) to (doc_address, score)
-        all_hits = [(hit[1], hit[0]) for hit in results.hits]
+        # TODO: Replace with collect_u64_fast_field("id", addrs) once
+        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands.
+        ids = []
+        for _score, doc_address in results.hits:
+            result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
+            if result_doc_id != doc_id:
+                ids.append(result_doc_id)

-        # Normalize scores
-        if all_hits:
-            max_score = max(hit[1] for hit in all_hits) or 1.0
-            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
-
-        # Get page hits
-        page_hits = all_hits[offset : offset + page_size]
-
-        # Build results
-        hits: list[SearchHit] = []
-        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
-            actual_doc = searcher.doc(doc_address)
-            doc_dict = actual_doc.to_dict()
-            result_doc_id = doc_dict["id"][0]
-
-            # Skip the original document
-            if result_doc_id == doc_id:
-                continue
-
-            hits.append(
-                SearchHit(
-                    id=result_doc_id,
-                    score=score,
-                    rank=rank,
-                    highlights={},  # MLT doesn't generate highlights
-                ),
-            )
-
-        return SearchResults(
-            hits=hits,
-            total=max(0, total - 1),  # Subtract 1 for the original document
-            query=f"more_like:{doc_id}",
-        )
+        return ids[:limit] if limit is not None else ids

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
        """