paperless-ngx/src/documents/search/_backend.py

from __future__ import annotations

import logging
import threading
from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
from enum import StrEnum
from typing import TYPE_CHECKING
from typing import Self
from typing import TypedDict
from typing import TypeVar

import filelock
import regex
import tantivy
from django.conf import settings
from django.utils.timezone import get_current_timezone
from guardian.shortcuts import get_users_with_perms

from documents.search._normalize import ascii_fold
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_simple_title_query
from documents.search._query import parse_user_query
from documents.search._schema import _write_sentinels
from documents.search._schema import build_schema
from documents.search._schema import open_or_rebuild_index
from documents.search._schema import wipe_index
from documents.search._tokenizer import register_tokenizers
from documents.utils import IterWrapper
from documents.utils import identity

if TYPE_CHECKING:
    from pathlib import Path

    from django.contrib.auth.base_user import AbstractBaseUser
    from django.db.models import QuerySet

    from documents.models import Document

logger = logging.getLogger("paperless.search")

_WORD_RE = regex.compile(r"\w+")
_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0  # seconds; guards against ReDoS on untrusted content

T = TypeVar("T")


class SearchMode(StrEnum):
    QUERY = "query"
    TEXT = "text"
    TITLE = "title"


def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
    """Extract and normalize words for autocomplete.

    Splits on non-word characters (matching Tantivy's simple tokenizer), lowercases,
    and ascii-folds each token. Uses the regex library with a timeout to guard against
    ReDoS on untrusted document content.
    """
    words = set()
    for text in text_sources:
        if not text:
            continue
        try:
            tokens = _WORD_RE.findall(text, timeout=_AUTOCOMPLETE_REGEX_TIMEOUT)
        except TimeoutError:  # pragma: no cover
            logger.warning(
                "Autocomplete word extraction timed out for a text source; skipping.",
            )
            continue
        for token in tokens:
            normalized = ascii_fold(token.lower())
            if normalized:
                words.add(normalized)
    return words


class SearchHit(TypedDict):
    """Type definition for search result hits."""

    id: int
    score: float
    rank: int
    highlights: dict[str, str]


@dataclass(frozen=True, slots=True)
class SearchResults:
    """
    Container for search results with pagination metadata.

    Attributes:
        hits: List of search results with scores and highlights
        total: Total matching documents across all pages (for pagination)
        query: Preprocessed query string after date/syntax rewriting
    """

    hits: list[SearchHit]
    total: int  # total matching documents (for pagination)
    query: str  # preprocessed query string


class TantivyRelevanceList:
    """
    DRF-compatible list wrapper for Tantivy search results.

    Holds a lightweight ordered list of IDs (for pagination count and
    ``selection_data``) together with a small page of rich ``SearchHit``
    dicts (for serialization).  DRF's ``PageNumberPagination`` calls
    ``__len__`` to compute the total page count and ``__getitem__`` to
    slice the displayed page.

    Args:
        ordered_ids: All matching document IDs in display order.
        page_hits: Rich SearchHit dicts for the requested DRF page only.
        page_offset: Index into *ordered_ids* where *page_hits* starts.
    """

    def __init__(
        self,
        ordered_ids: list[int],
        page_hits: list[SearchHit],
        page_offset: int = 0,
    ) -> None:
        self._ordered_ids = ordered_ids
        self._page_hits = page_hits
        self._page_offset = page_offset

    def __len__(self) -> int:
        return len(self._ordered_ids)

    def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]:
        if isinstance(key, int):
            idx = key if key >= 0 else len(self._ordered_ids) + key
            if self._page_offset <= idx < self._page_offset + len(self._page_hits):
                return self._page_hits[idx - self._page_offset]
            return SearchHit(
                id=self._ordered_ids[key],
                score=0.0,
                rank=idx + 1,
                highlights={},
            )
        start = key.start or 0
        stop = key.stop or len(self._ordered_ids)
        # DRF slices to extract the current page.  If the slice aligns
        # with our pre-fetched page_hits, return them directly.
        if start == self._page_offset and stop <= self._page_offset + len(
            self._page_hits,
        ):
            return self._page_hits[: stop - start]
        # Fallback: return stub dicts (no highlights).
        return [
            SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
            for i, doc_id in enumerate(self._ordered_ids[key])
        ]

    def get_all_ids(self) -> list[int]:
        """Return all matching document IDs in display order."""
        return self._ordered_ids


class SearchIndexLockError(Exception):
    """Raised when the search index file lock cannot be acquired within the timeout."""


class WriteBatch:
    """
    Context manager for bulk index operations with file locking.

    Provides transactional batch updates to the search index with proper
    concurrency control via file locking. All operations within the batch
    are committed atomically or rolled back on exception.

    Usage:
        with backend.batch_update() as batch:
            batch.add_or_update(document)
            batch.remove(doc_id)
    """

    def __init__(self, backend: TantivyBackend, lock_timeout: float):
        self._backend = backend
        self._lock_timeout = lock_timeout
        self._writer = None
        self._lock = None

    def __enter__(self) -> Self:
        if self._backend._path is not None:
            lock_path = self._backend._path / ".tantivy.lock"
            self._lock = filelock.FileLock(str(lock_path))
            try:
                self._lock.acquire(timeout=self._lock_timeout)
            except filelock.Timeout as e:  # pragma: no cover
                raise SearchIndexLockError(
                    f"Could not acquire index lock within {self._lock_timeout}s",
                ) from e

        self._writer = self._backend._index.writer()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            if exc_type is None:
                self._writer.commit()
                self._backend._index.reload()
            # Explicitly delete writer to release tantivy's internal lock.
            # On exception the uncommitted writer is simply discarded.
            if self._writer is not None:
                del self._writer
                self._writer = None
        finally:
            if self._lock is not None:
                self._lock.release()

    def add_or_update(
        self,
        document: Document,
        effective_content: str | None = None,
    ) -> None:
        """
        Add or update a document in the batch.

        Implements upsert behavior by deleting any existing document with the same ID
        and adding the new version. This ensures stale document data (e.g., after
        permission changes) doesn't persist in the index.

        Args:
            document: Django Document instance to index
            effective_content: Override document.content for indexing (used when
                re-indexing with newer OCR text from document versions)
        """
        self.remove(document.pk)
        doc = self._backend._build_tantivy_doc(document, effective_content)
        self._writer.add_document(doc)

    def remove(self, doc_id: int) -> None:
        """
        Remove a document from the batch by its primary key.

        Uses range query instead of term query to work around unsigned integer
        type detection bug in tantivy-py 0.25.
        """
        # Use range query to work around u64 deletion bug
        self._writer.delete_documents_by_query(
            tantivy.Query.range_query(
                self._backend._schema,
                "id",
                tantivy.FieldType.Unsigned,
                doc_id,
                doc_id,
            ),
        )


class TantivyBackend:
    """
    Tantivy search backend with explicit lifecycle management.

    Provides full-text search capabilities using the Tantivy search engine.
    Supports in-memory indexes (for testing) and persistent on-disk indexes
    (for production use). Handles document indexing, search queries, autocompletion,
    and "more like this" functionality.

    The backend manages its own connection lifecycle and can be reset when
    the underlying index directory changes (e.g., during test isolation).
    """

    # Maps DRF ordering field names to Tantivy index field names.
    SORT_FIELD_MAP: dict[str, str] = {
        "title": "title_sort",
        "correspondent__name": "correspondent_sort",
        "document_type__name": "type_sort",
        "created": "created",
        "added": "added",
        "modified": "modified",
        "archive_serial_number": "asn",
        "page_count": "page_count",
        "num_notes": "num_notes",
    }

    # Fields where Tantivy's sort order matches the ORM's sort order.
    # Text-based fields (title, correspondent__name, document_type__name)
    # are excluded because Tantivy's tokenized fast fields produce different
    # ordering than the ORM's collation-based ordering.
    SORTABLE_FIELDS: frozenset[str] = frozenset(
        {
            "created",
            "added",
            "modified",
            "archive_serial_number",
            "page_count",
            "num_notes",
        },
    )

    def __init__(self, path: Path | None = None):
        # path=None → in-memory index (for tests)
        # path=some_dir → on-disk index (for production)
        self._path = path
        self._index = None
        self._schema = None

    def open(self) -> None:
        """
        Open or rebuild the index as needed.

        For disk-based indexes, checks if rebuilding is needed due to schema
        version or language changes. Registers custom tokenizers after opening.
        Safe to call multiple times - subsequent calls are no-ops.
        """
        if self._index is not None:
            return  # pragma: no cover
        if self._path is not None:
            self._index = open_or_rebuild_index(self._path)
        else:
            self._index = tantivy.Index(build_schema())
        register_tokenizers(self._index, settings.SEARCH_LANGUAGE)
        self._schema = self._index.schema

    def close(self) -> None:
        """
        Close the index and release resources.

        Safe to call multiple times - subsequent calls are no-ops.
        """
        self._index = None
        self._schema = None

    def _ensure_open(self) -> None:
        """Ensure the index is open before operations."""
        if self._index is None:
            self.open()  # pragma: no cover

    def _parse_query(
        self,
        query: str,
        search_mode: SearchMode,
    ) -> tantivy.Query:
        """Parse a user query string into a Tantivy Query object."""
        tz = get_current_timezone()
        if search_mode is SearchMode.TEXT:
            return parse_simple_text_query(self._index, query)
        elif search_mode is SearchMode.TITLE:
            return parse_simple_title_query(self._index, query)
        else:
            return parse_user_query(self._index, query, tz)

    def _apply_permission_filter(
        self,
        query: tantivy.Query,
        user: AbstractBaseUser | None,
    ) -> tantivy.Query:
        """Wrap a query with a permission filter if the user is not a superuser."""
        if user is not None:
            permission_filter = build_permission_filter(self._schema, user)
            return tantivy.Query.boolean_query(
                [
                    (tantivy.Occur.Must, query),
                    (tantivy.Occur.Must, permission_filter),
                ],
            )
        return query

    def _build_tantivy_doc(
        self,
        document: Document,
        effective_content: str | None = None,
    ) -> tantivy.Document:
        """Build a tantivy Document from a Django Document instance.

        ``effective_content`` overrides ``document.content`` for indexing —
        used when re-indexing a root document with a newer version's OCR text.
        """
        content = (
            effective_content if effective_content is not None else document.content
        )

        doc = tantivy.Document()

        # Basic fields
        doc.add_unsigned("id", document.pk)
        doc.add_text("checksum", document.checksum)
        doc.add_text("title", document.title)
        doc.add_text("title_sort", document.title)
        doc.add_text("simple_title", document.title)
        doc.add_text("content", content)
        doc.add_text("bigram_content", content)
        doc.add_text("simple_content", content)

        # Original filename - only add if not None/empty
        if document.original_filename:
            doc.add_text("original_filename", document.original_filename)

        # Correspondent
        if document.correspondent:
            doc.add_text("correspondent", document.correspondent.name)
            doc.add_text("correspondent_sort", document.correspondent.name)
            doc.add_unsigned("correspondent_id", document.correspondent_id)

        # Document type
        if document.document_type:
            doc.add_text("document_type", document.document_type.name)
            doc.add_text("type_sort", document.document_type.name)
            doc.add_unsigned("document_type_id", document.document_type_id)

        # Storage path
        if document.storage_path:
            doc.add_text("storage_path", document.storage_path.name)
            doc.add_unsigned("storage_path_id", document.storage_path_id)

        # Tags — collect names for autocomplete in the same pass
        tag_names: list[str] = []
        for tag in document.tags.all():
            doc.add_text("tag", tag.name)
            doc.add_unsigned("tag_id", tag.pk)
            tag_names.append(tag.name)

        # Notes — JSON for structured queries (notes.user:alice, notes.note:text),
        # companion text field for default full-text search.
        num_notes = 0
        for note in document.notes.all():
            num_notes += 1
            doc.add_json("notes", {"note": note.note, "user": note.user.username})

        # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
        # companion text field for default full-text search.
        for cfi in document.custom_fields.all():
            search_value = cfi.value_for_search
            # Skip fields where there is no value yet
            if search_value is None:
                continue
            doc.add_json(
                "custom_fields",
                {
                    "name": cfi.field.name,
                    "value": search_value,
                },
            )

        # Dates
        created_date = datetime(
            document.created.year,
            document.created.month,
            document.created.day,
            tzinfo=UTC,
        )
        doc.add_date("created", created_date)
        doc.add_date("modified", document.modified)
        doc.add_date("added", document.added)

        if document.archive_serial_number is not None:
            doc.add_unsigned("asn", document.archive_serial_number)

        if document.page_count is not None:
            doc.add_unsigned("page_count", document.page_count)

        doc.add_unsigned("num_notes", num_notes)

        # Owner
        if document.owner_id:
            doc.add_unsigned("owner_id", document.owner_id)

        # Viewers with permission
        users_with_perms = get_users_with_perms(
            document,
            only_with_perms_in=["view_document"],
        )
        for user in users_with_perms:
            doc.add_unsigned("viewer_id", user.pk)

        # Autocomplete words
        text_sources = [document.title, content]
        if document.correspondent:
            text_sources.append(document.correspondent.name)
        if document.document_type:
            text_sources.append(document.document_type.name)
        text_sources.extend(tag_names)

        for word in sorted(_extract_autocomplete_words(text_sources)):
            doc.add_text("autocomplete_word", word)

        return doc

    def add_or_update(
        self,
        document: Document,
        effective_content: str | None = None,
    ) -> None:
        """
        Add or update a single document with file locking.

        Convenience method for single-document updates. For bulk operations,
        use batch_update() context manager for better performance.

        Args:
            document: Django Document instance to index
            effective_content: Override document.content for indexing
        """
        self._ensure_open()
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.add_or_update(document, effective_content)

    def remove(self, doc_id: int) -> None:
        """
        Remove a single document from the index with file locking.

        Convenience method for single-document removal. For bulk operations,
        use batch_update() context manager for better performance.

        Args:
            doc_id: Primary key of the document to remove
        """
        self._ensure_open()
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.remove(doc_id)

    def search(
        self,
        query: str,
        user: AbstractBaseUser | None,
        page: int,
        page_size: int,
        sort_field: str | None,
        *,
        sort_reverse: bool,
        search_mode: SearchMode = SearchMode.QUERY,
        highlight_page: int | None = None,
        highlight_page_size: int | None = None,
    ) -> SearchResults:
        """
        Execute a search query against the document index.

        Processes the user query through date rewriting, normalization, and
        permission filtering before executing against Tantivy. Supports both
        relevance-based and field-based sorting.

        QUERY search mode supports natural date keywords, field filters, etc.
        TITLE search mode treats the query as plain text to search for in title only
        TEXT search mode treats the query as plain text to search for in title and content

        Args:
            query: User's search query
            user: User for permission filtering (None for superuser/no filtering)
            page: Page number (1-indexed) for pagination
            page_size: Number of results per page
            sort_field: Field to sort by (None for relevance ranking)
            sort_reverse: Whether to reverse the sort order
            search_mode: "query" for advanced Tantivy syntax, "text" for
                plain-text search over title and content only, "title" for
                plain-text search over title only

        Returns:
            SearchResults with hits, total count, and processed query
        """
        self._ensure_open()
        user_query = self._parse_query(query, search_mode)
        final_query = self._apply_permission_filter(user_query, user)

        searcher = self._index.searcher()
        offset = (page - 1) * page_size

        # Perform search
        if sort_field and sort_field in self.SORT_FIELD_MAP:
            mapped_field = self.SORT_FIELD_MAP[sort_field]
            results = searcher.search(
                final_query,
                limit=offset + page_size,
                order_by_field=mapped_field,
                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
            )
            # Field sorting: hits are still (score, DocAddress) tuples; score unused
            all_hits = [(hit[1], 0.0) for hit in results.hits]
        else:
            # Score-based search: hits are (score, DocAddress) tuples
            results = searcher.search(final_query, limit=offset + page_size)
            all_hits = [(hit[1], hit[0]) for hit in results.hits]

        total = results.count

        # Normalize scores for score-based searches
        if not sort_field and all_hits:
            max_score = max(hit[1] for hit in all_hits) or 1.0
            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]

        # Apply threshold filter if configured (score-based search only)
        threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
        if threshold is not None and not sort_field:
            all_hits = [hit for hit in all_hits if hit[1] >= threshold]

        # Get the page's hits
        page_hits = all_hits[offset : offset + page_size]

        # Build result hits with highlights
        hits: list[SearchHit] = []
        snippet_generator = None
        notes_snippet_generator = None

        # Determine which hits need highlights
        if highlight_page is not None and highlight_page_size is not None:
            hl_start = (highlight_page - 1) * highlight_page_size
            hl_end = hl_start + highlight_page_size
        else:
            # Highlight all hits (backward-compatible default)
            hl_start = 0
            hl_end = len(page_hits)

        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
            # Get the actual document from the searcher using the doc address
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
            doc_id = doc_dict["id"][0]

            highlights: dict[str, str] = {}

            # Generate highlights if score > 0 and hit is in the highlight window
            hit_index = rank - offset - 1  # 0-based index within page_hits
            if score > 0 and hl_start <= hit_index < hl_end:
                try:
                    if snippet_generator is None:
                        snippet_generator = tantivy.SnippetGenerator.create(
                            searcher,
                            final_query,
                            self._schema,
                            "content",
                        )

                    content_snippet = snippet_generator.snippet_from_doc(actual_doc)
                    if content_snippet:
                        highlights["content"] = str(content_snippet)

                    # Try notes highlights
                    if "notes" in doc_dict:
                        if notes_snippet_generator is None:
                            notes_snippet_generator = tantivy.SnippetGenerator.create(
                                searcher,
                                final_query,
                                self._schema,
                                "notes",
                            )
                        notes_snippet = notes_snippet_generator.snippet_from_doc(
                            actual_doc,
                        )
                        if notes_snippet:
                            highlights["notes"] = str(notes_snippet)

                except Exception:  # pragma: no cover
                    logger.debug("Failed to generate highlights for doc %s", doc_id)

            hits.append(
                SearchHit(
                    id=doc_id,
                    score=score,
                    rank=rank,
                    highlights=highlights,
                ),
            )

        return SearchResults(
            hits=hits,
            total=total,
            query=query,
        )

    def highlight_hits(
        self,
        query: str,
        doc_ids: list[int],
        *,
        search_mode: SearchMode = SearchMode.QUERY,
    ) -> list[SearchHit]:
        """
        Generate SearchHit dicts with highlights for specific document IDs.

        Unlike search(), this does not execute a ranked query — it looks up
        each document by ID and generates snippets against the provided query.
        Use this when you already know which documents to display (from
        search_ids + ORM filtering) and just need highlight data.

        Note: Each doc_id requires an individual index lookup because tantivy-py
        does not expose a batch doc-address-by-ID API. This is acceptable for
        page-sized batches (typically 25 docs) but should not be called with
        thousands of IDs.

        Args:
            query: The search query (used for snippet generation)
            doc_ids: Ordered list of document IDs to generate hits for
            search_mode: Query parsing mode (for building the snippet query)

        Returns:
            List of SearchHit dicts in the same order as doc_ids
        """
        if not doc_ids:
            return []

        self._ensure_open()
        user_query = self._parse_query(query, search_mode)

        searcher = self._index.searcher()
        snippet_generator = None
        hits: list[SearchHit] = []

        for rank, doc_id in enumerate(doc_ids, start=1):
            # Look up document by ID
            id_query = tantivy.Query.range_query(
                self._schema,
                "id",
                tantivy.FieldType.Unsigned,
                doc_id,
                doc_id,
            )
            results = searcher.search(id_query, limit=1)

            if not results.hits:
                continue

            doc_address = results.hits[0][1]
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()

            highlights: dict[str, str] = {}
            try:
                if snippet_generator is None:
                    snippet_generator = tantivy.SnippetGenerator.create(
                        searcher,
                        user_query,
                        self._schema,
                        "content",
                    )

                content_snippet = snippet_generator.snippet_from_doc(actual_doc)
                if content_snippet:
                    highlights["content"] = str(content_snippet)

                if "notes" in doc_dict:
                    notes_generator = tantivy.SnippetGenerator.create(
                        searcher,
                        user_query,
                        self._schema,
                        "notes",
                    )
                    notes_snippet = notes_generator.snippet_from_doc(actual_doc)
                    if notes_snippet:
                        highlights["notes"] = str(notes_snippet)

            except Exception:  # pragma: no cover
                logger.debug("Failed to generate highlights for doc %s", doc_id)

            hits.append(
                SearchHit(
                    id=doc_id,
                    score=0.0,
                    rank=rank,
                    highlights=highlights,
                ),
            )

        return hits

    def search_ids(
        self,
        query: str,
        user: AbstractBaseUser | None,
        *,
        sort_field: str | None = None,
        sort_reverse: bool = False,
        search_mode: SearchMode = SearchMode.QUERY,
        limit: int | None = None,
    ) -> list[int]:
        """
        Return document IDs matching a query — no highlights or scores.

        This is the lightweight companion to search(). Use it when you need the
        full set of matching IDs (e.g. for ``selection_data``) but don't need
        scores, ranks, or highlights.

        Args:
            query: User's search query
            user: User for permission filtering (None for superuser/no filtering)
            sort_field: Field to sort by (None for relevance ranking)
            sort_reverse: Whether to reverse the sort order
            search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
            limit: Maximum number of IDs to return (None = all matching docs)

        Returns:
            List of document IDs in the requested order
        """
        self._ensure_open()
        user_query = self._parse_query(query, search_mode)
        final_query = self._apply_permission_filter(user_query, user)

        searcher = self._index.searcher()
        effective_limit = limit if limit is not None else searcher.num_docs

        if sort_field and sort_field in self.SORT_FIELD_MAP:
            mapped_field = self.SORT_FIELD_MAP[sort_field]
            results = searcher.search(
                final_query,
                limit=effective_limit,
                order_by_field=mapped_field,
                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
            )
            all_hits = [(hit[1],) for hit in results.hits]
        else:
            results = searcher.search(final_query, limit=effective_limit)
            all_hits = [(hit[1], hit[0]) for hit in results.hits]

            # Normalize scores and apply threshold (relevance search only)
            if all_hits:
                max_score = max(hit[1] for hit in all_hits) or 1.0
                all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]

            threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
            if threshold is not None:
                all_hits = [hit for hit in all_hits if hit[1] >= threshold]

        return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]

    def autocomplete(
        self,
        term: str,
        limit: int,
        user: AbstractBaseUser | None = None,
    ) -> list[str]:
        """
        Get autocomplete suggestions for search queries.

        Returns words that start with the given term prefix, ranked by document
        frequency (how many documents contain each word). Optionally filters
        results to only words from documents visible to the specified user.

        Args:
            term: Prefix to match against autocomplete words
            limit: Maximum number of suggestions to return
            user: User for permission filtering (None for no filtering)

        Returns:
            List of word suggestions ordered by frequency, then alphabetically
        """
        self._ensure_open()
        normalized_term = ascii_fold(term.lower())

        searcher = self._index.searcher()

        # Apply permission filter for non-superusers so autocomplete words
        # from invisible documents don't leak to other users.
        if user is not None and not user.is_superuser:
            base_query = build_permission_filter(self._schema, user)
        else:
            base_query = tantivy.Query.all_query()

        results = searcher.search(base_query, limit=searcher.num_docs)

        # Count how many visible documents each word appears in.
        # Using Counter (not set) preserves per-word document frequency so
        # we can rank suggestions by how commonly they occur — the same
        # signal Whoosh used for Tf/Idf-based autocomplete ordering.
        word_counts: Counter[str] = Counter()
        for _score, doc_address in results.hits:
            stored_doc = searcher.doc(doc_address)
            doc_dict = stored_doc.to_dict()
            if "autocomplete_word" in doc_dict:
                word_counts.update(doc_dict["autocomplete_word"])

        # Filter to prefix matches, sort by document frequency descending;
        # break ties alphabetically for stable, deterministic output.
        matches = sorted(
            (w for w in word_counts if w.startswith(normalized_term)),
            key=lambda w: (-word_counts[w], w),
        )

        return matches[:limit]

    def more_like_this(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
        page: int,
        page_size: int,
    ) -> SearchResults:
        """
        Find documents similar to the given document using content analysis.

        Uses Tantivy's "more like this" query to find documents with similar
        content patterns. The original document is excluded from results.

        Args:
            doc_id: Primary key of the reference document
            user: User for permission filtering (None for no filtering)
            page: Page number (1-indexed) for pagination
            page_size: Number of results per page

        Returns:
            SearchResults with similar documents (excluding the original)
        """
        self._ensure_open()
        searcher = self._index.searcher()

        # First find the document address
        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        )
        results = searcher.search(id_query, limit=1)

        if not results.hits:
            # Document not found
            return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")

        # Extract doc_address from (score, doc_address) tuple
        doc_address = results.hits[0][1]

        # Build more like this query
        mlt_query = tantivy.Query.more_like_this_query(
            doc_address,
            min_doc_frequency=1,
            max_doc_frequency=None,
            min_term_frequency=1,
            max_query_terms=12,
            min_word_length=None,
            max_word_length=None,
            boost_factor=None,
        )

        final_query = self._apply_permission_filter(mlt_query, user)

        # Search
        offset = (page - 1) * page_size
        results = searcher.search(final_query, limit=offset + page_size)

        total = results.count
        # Convert from (score, doc_address) to (doc_address, score)
        all_hits = [(hit[1], hit[0]) for hit in results.hits]

        # Normalize scores
        if all_hits:
            max_score = max(hit[1] for hit in all_hits) or 1.0
            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]

        # Get page hits
        page_hits = all_hits[offset : offset + page_size]

        # Build results
        hits: list[SearchHit] = []
        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
            result_doc_id = doc_dict["id"][0]

            # Skip the original document
            if result_doc_id == doc_id:
                continue

            hits.append(
                SearchHit(
                    id=result_doc_id,
                    score=score,
                    rank=rank,
                    highlights={},  # MLT doesn't generate highlights
                ),
            )

        return SearchResults(
            hits=hits,
            total=max(0, total - 1),  # Subtract 1 for the original document
            query=f"more_like:{doc_id}",
        )

    def more_like_this_ids(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
        *,
        limit: int | None = None,
    ) -> list[int]:
        """
        Return IDs of documents similar to the given document — no highlights.

        Lightweight companion to more_like_this(). The original document is
        excluded from results.

        Args:
            doc_id: Primary key of the reference document
            user: User for permission filtering (None for no filtering)
            limit: Maximum number of IDs to return (None = all matching docs)

        Returns:
            List of similar document IDs (excluding the original)
        """
        self._ensure_open()
        searcher = self._index.searcher()

        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        )
        results = searcher.search(id_query, limit=1)

        if not results.hits:
            return []

        doc_address = results.hits[0][1]
        mlt_query = tantivy.Query.more_like_this_query(
            doc_address,
            min_doc_frequency=1,
            max_doc_frequency=None,
            min_term_frequency=1,
            max_query_terms=12,
            min_word_length=None,
            max_word_length=None,
            boost_factor=None,
        )

        final_query = self._apply_permission_filter(mlt_query, user)

        effective_limit = limit if limit is not None else searcher.num_docs
        results = searcher.search(final_query, limit=effective_limit)

        ids = []
        for _score, doc_address in results.hits:
            result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
            if result_doc_id != doc_id:
                ids.append(result_doc_id)
        return ids

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
        """
        Get a batch context manager for bulk index operations.

        Use this for efficient bulk document updates/deletions. All operations
        within the batch are committed atomically at the end of the context.

        Args:
            lock_timeout: Seconds to wait for file lock acquisition

        Returns:
            WriteBatch context manager

        Raises:
            SearchIndexLockError: If lock cannot be acquired within timeout
        """
        self._ensure_open()
        return WriteBatch(self, lock_timeout)

    def rebuild(
        self,
        documents: QuerySet[Document],
        iter_wrapper: IterWrapper[Document] = identity,
    ) -> None:
        """
        Rebuild the entire search index from scratch.

        Wipes the existing index and re-indexes all provided documents.
        On failure, restores the previous index state to keep the backend usable.

        Args:
            documents: QuerySet of Document instances to index
            iter_wrapper: Optional wrapper function for progress tracking
                (e.g., progress bar). Should yield each document unchanged.
        """
        # Create new index (on-disk or in-memory)
        if self._path is not None:
            wipe_index(self._path)
            new_index = tantivy.Index(build_schema(), path=str(self._path))
            _write_sentinels(self._path)
        else:
            new_index = tantivy.Index(build_schema())
        register_tokenizers(new_index, settings.SEARCH_LANGUAGE)

        # Point instance at the new index so _build_tantivy_doc uses it
        old_index, old_schema = self._index, self._schema
        self._index = new_index
        self._schema = new_index.schema

        try:
            writer = new_index.writer()
            for document in iter_wrapper(documents):
                doc = self._build_tantivy_doc(
                    document,
                    document.get_effective_content(),
                )
                writer.add_document(doc)
            writer.commit()
            new_index.reload()
        except BaseException:  # pragma: no cover
            # Restore old index on failure so the backend remains usable
            self._index = old_index
            self._schema = old_schema
            raise


# Module-level singleton with proper thread safety
_backend: TantivyBackend | None = None
_backend_path: Path | None = None  # tracks which INDEX_DIR the singleton uses
_backend_lock = threading.RLock()


def get_backend() -> TantivyBackend:
    """
    Get the global backend instance with thread safety.

    Returns a singleton TantivyBackend instance, automatically reinitializing
    when settings.INDEX_DIR changes. This ensures proper test isolation when
    using pytest-xdist or @override_settings that change the index directory.

    Returns:
        Thread-safe singleton TantivyBackend instance
    """
    global _backend, _backend_path

    current_path: Path = settings.INDEX_DIR

    # Fast path: backend is initialized and path hasn't changed (no lock needed)
    if _backend is not None and _backend_path == current_path:
        return _backend

    # Slow path: first call, or INDEX_DIR changed between calls
    with _backend_lock:
        # Double-check after acquiring lock — another thread may have beaten us
        if _backend is not None and _backend_path == current_path:
            return _backend  # pragma: no cover

        if _backend is not None:
            _backend.close()

        _backend = TantivyBackend(path=current_path)
        _backend.open()
        _backend_path = current_path

        return _backend


def reset_backend() -> None:
    """
    Reset the global backend instance with thread safety.

    Forces creation of a new backend instance on the next get_backend() call.
    Used for test isolation and when switching between different index directories.
    """
    global _backend, _backend_path

    with _backend_lock:
        if _backend is not None:
            _backend.close()
        _backend = None
        _backend_path = None