docs: Enhance docstrings and test quality for Tantivy search backend

- Add comprehensive docstrings to all public methods and classes in the search package
  - Clarify purpose, parameters, return values, and implementation notes
  - Document thread safety, error handling, and usage patterns
  - Explain Tantivy-specific workarounds and design decisions
- Improve test quality and pytest compliance
  - Add descriptive comments explaining what each test verifies
  - Convert TestIndexOptimize to pytest style with @pytest.mark.django_db
  - Ensure all test docstrings focus on behavior verification rather than implementation
- Maintain existing functionality while improving code documentation
  - No changes to production logic or test coverage
  - All tests continue to pass with enhanced clarity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 11:05:28 +00:00 · 2026-03-30 15:54:18 -07:00
parent 12eb9b9abf
commit e7f68c2082
9 changed files with 342 additions and 78 deletions
@@ -14,6 +14,13 @@ logger = logging.getLogger("paperless.management.document_index")


 class Command(PaperlessCommand):
+    """
+    Django management command for search index operations.
+
+    Provides subcommands for reindexing documents and optimizing the search index.
+    Supports conditional reindexing based on schema version and language changes.
+    """
+
    help = "Manages the document index."

    supports_progress_bar = True
@@ -46,12 +46,17 @@ T = TypeVar("T")


 def _identity(iterable: Iterable[T]) -> Iterable[T]:
-    """Default iter_wrapper that passes through unchanged."""
+    """Default iter_wrapper that passes documents through unchanged for indexing."""
    return iterable


 def _ascii_fold(s: str) -> str:
-    """Normalize unicode to ASCII equivalent characters."""
+    """
+    Normalize unicode to ASCII equivalent characters for search consistency.
+
+    Converts accented characters (e.g., "café") to their ASCII base forms ("cafe")
+    to enable cross-language searching without requiring exact diacritic matching.
+    """
    return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()


@@ -91,17 +96,33 @@ class SearchHit(TypedDict):

@dataclass(frozen=True, slots=True)
 class SearchResults:
+    """
+    Container for search results with pagination metadata.
+
+    Attributes:
+        hits: List of search results with scores and highlights
+        total: Total matching documents across all pages (for pagination)
+        query: Preprocessed query string after date/syntax rewriting
+    """
+
    hits: list[SearchHit]
    total: int  # total matching documents (for pagination)
    query: str  # preprocessed query string


 class TantivyRelevanceList:
-    """DRF-compatible list wrapper for Tantivy search hits.
+    """
+    DRF-compatible list wrapper for Tantivy search hits.

-    __len__ returns the total hit count (for pagination); __getitem__ slices
-    the hit list.  Stores ALL post-filter hits so that get_all_result_ids()
-    can return every matching doc ID without a second query.
+    Provides paginated access to search results while storing all hits in memory
+    for efficient ID retrieval. Used by Django REST framework for pagination.
+
+    Methods:
+        __len__: Returns total hit count for pagination calculations
+        __getitem__: Slices the hit list for page-specific results
+
+    Note: Stores ALL post-filter hits so get_all_result_ids() can return
+    every matching document ID without requiring a second search query.
    """

    def __init__(self, hits: list[SearchHit]) -> None:
@@ -115,11 +136,22 @@ class TantivyRelevanceList:


 class SearchIndexLockError(Exception):
-    pass
+    """Raised when the search index file lock cannot be acquired within the timeout."""


 class WriteBatch:
-    """Context manager for bulk index operations with file locking."""
+    """
+    Context manager for bulk index operations with file locking.
+
+    Provides transactional batch updates to the search index with proper
+    concurrency control via file locking. All operations within the batch
+    are committed atomically or rolled back on exception.
+
+    Usage:
+        with backend.batch_update() as batch:
+            batch.add_or_update(document)
+            batch.remove(doc_id)
+    """

    def __init__(self, backend: TantivyBackend, lock_timeout: float):
        self._backend = backend
@@ -160,18 +192,29 @@ class WriteBatch:
        document: Document,
        effective_content: str | None = None,
    ) -> None:
-        """Add or update a document in the batch.
+        """
+        Add or update a document in the batch.

-        Tantivy has no native upsert — we delete by id then re-add so
-        stale copies (e.g. after a permission change) don't linger.
-        ``effective_content`` overrides ``document.content`` for indexing.
+        Implements upsert behavior by deleting any existing document with the same ID
+        and adding the new version. This ensures stale document data (e.g., after
+        permission changes) doesn't persist in the index.
+
+        Args:
+            document: Django Document instance to index
+            effective_content: Override document.content for indexing (used when
+                re-indexing with newer OCR text from document versions)
        """
        self.remove(document.pk)
        doc = self._backend._build_tantivy_doc(document, effective_content)
        self._writer.add_document(doc)

    def remove(self, doc_id: int) -> None:
-        """Remove a document from the batch."""
+        """
+        Remove a document from the batch by its primary key.
+
+        Uses range query instead of term query to work around unsigned integer
+        type detection bug in tantivy-py 0.25.
+        """
        # Use range query to work around u64 deletion bug
        self._writer.delete_documents_by_query(
            tantivy.Query.range_query(
@@ -185,7 +228,17 @@ class WriteBatch:


 class TantivyBackend:
-    """Tantivy search backend with explicit lifecycle management."""
+    """
+    Tantivy search backend with explicit lifecycle management.
+
+    Provides full-text search capabilities using the Tantivy search engine.
+    Supports in-memory indexes (for testing) and persistent on-disk indexes
+    (for production use). Handles document indexing, search queries, autocompletion,
+    and "more like this" functionality.
+
+    The backend manages its own connection lifecycle and can be reset when
+    the underlying index directory changes (e.g., during test isolation).
+    """

    def __init__(self, path: Path | None = None):
        # path=None → in-memory index (for tests)
@@ -195,7 +248,13 @@ class TantivyBackend:
        self._schema = None

    def open(self) -> None:
-        """Open or rebuild the index. Idempotent."""
+        """
+        Open or rebuild the index as needed.
+
+        For disk-based indexes, checks if rebuilding is needed due to schema
+        version or language changes. Registers custom tokenizers after opening.
+        Safe to call multiple times - subsequent calls are no-ops.
+        """
        if self._index is not None:
            return
        if self._path is not None:
@@ -206,7 +265,11 @@ class TantivyBackend:
        self._schema = self._index.schema

    def close(self) -> None:
-        """Close the index. Idempotent."""
+        """
+        Close the index and release resources.
+
+        Safe to call multiple times - subsequent calls are no-ops.
+        """
        self._index = None
        self._schema = None

@@ -339,13 +402,30 @@ class TantivyBackend:
        document: Document,
        effective_content: str | None = None,
    ) -> None:
-        """Add or update a single document with file locking."""
+        """
+        Add or update a single document with file locking.
+
+        Convenience method for single-document updates. For bulk operations,
+        use batch_update() context manager for better performance.
+
+        Args:
+            document: Django Document instance to index
+            effective_content: Override document.content for indexing
+        """
        self._ensure_open()
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.add_or_update(document, effective_content)

    def remove(self, doc_id: int) -> None:
-        """Remove a single document with file locking."""
+        """
+        Remove a single document from the index with file locking.
+
+        Convenience method for single-document removal. For bulk operations,
+        use batch_update() context manager for better performance.
+
+        Args:
+            doc_id: Primary key of the document to remove
+        """
        self._ensure_open()
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.remove(doc_id)
@@ -360,7 +440,24 @@ class TantivyBackend:
        *,
        sort_reverse: bool,
    ) -> SearchResults:
-        """Search the index."""
+        """
+        Execute a search query against the document index.
+
+        Processes the user query through date rewriting, normalization, and
+        permission filtering before executing against Tantivy. Supports both
+        relevance-based and field-based sorting.
+
+        Args:
+            query: User's search query (supports natural date keywords, field filters)
+            user: User for permission filtering (None for superuser/no filtering)
+            page: Page number (1-indexed) for pagination
+            page_size: Number of results per page
+            sort_field: Field to sort by (None for relevance ranking)
+            sort_reverse: Whether to reverse the sort order
+
+        Returns:
+            SearchResults with hits, total count, and processed query
+        """
        self._ensure_open()
        tz = get_current_timezone()
        user_query = parse_user_query(self._index, query, tz)
@@ -491,7 +588,21 @@ class TantivyBackend:
        limit: int,
        user: AbstractBaseUser | None = None,
    ) -> list[str]:
-        """Get autocomplete suggestions, optionally filtered by user visibility."""
+        """
+        Get autocomplete suggestions for search queries.
+
+        Returns words that start with the given term prefix, ranked by document
+        frequency (how many documents contain each word). Optionally filters
+        results to only words from documents visible to the specified user.
+
+        Args:
+            term: Prefix to match against autocomplete words
+            limit: Maximum number of suggestions to return
+            user: User for permission filtering (None for no filtering)
+
+        Returns:
+            List of word suggestions ordered by frequency, then alphabetically
+        """
        self._ensure_open()
        normalized_term = _ascii_fold(term.lower())

@@ -533,7 +644,21 @@ class TantivyBackend:
        page: int,
        page_size: int,
    ) -> SearchResults:
-        """Find documents similar to the given document."""
+        """
+        Find documents similar to the given document using content analysis.
+
+        Uses Tantivy's "more like this" query to find documents with similar
+        content patterns. The original document is excluded from results.
+
+        Args:
+            doc_id: Primary key of the reference document
+            user: User for permission filtering (None for no filtering)
+            page: Page number (1-indexed) for pagination
+            page_size: Number of results per page
+
+        Returns:
+            SearchResults with similar documents (excluding the original)
+        """
        self._ensure_open()
        searcher = self._index.searcher()

@@ -621,12 +746,36 @@ class TantivyBackend:
        )

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
-        """Get a batch context manager for bulk operations."""
+        """
+        Get a batch context manager for bulk index operations.
+
+        Use this for efficient bulk document updates/deletions. All operations
+        within the batch are committed atomically at the end of the context.
+
+        Args:
+            lock_timeout: Seconds to wait for file lock acquisition
+
+        Returns:
+            WriteBatch context manager
+
+        Raises:
+            SearchIndexLockError: If lock cannot be acquired within timeout
+        """
        self._ensure_open()
        return WriteBatch(self, lock_timeout)

    def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
-        """Rebuild the entire search index."""
+        """
+        Rebuild the entire search index from scratch.
+
+        Wipes the existing index and re-indexes all provided documents.
+        On failure, restores the previous index state to keep the backend usable.
+
+        Args:
+            documents: QuerySet of Document instances to index
+            iter_wrapper: Optional wrapper function for progress tracking
+                (e.g., progress bar). Should yield each document unchanged.
+        """
        # Create new index (on-disk or in-memory)
        if self._path is not None:
            wipe_index(self._path)
@@ -662,11 +811,15 @@ _backend_lock = threading.RLock()


 def get_backend() -> TantivyBackend:
-    """Get the global backend instance with thread safety.
+    """
+    Get the global backend instance with thread safety.

-    Automatically reinitializes when settings.INDEX_DIR changes — this fixes
-    the xdist/override_settings isolation issue where each test may set a
-    different INDEX_DIR but would otherwise share a stale singleton.
+    Returns a singleton TantivyBackend instance, automatically reinitializing
+    when settings.INDEX_DIR changes. This ensures proper test isolation when
+    using pytest-xdist or @override_settings that change the index directory.
+
+    Returns:
+        Thread-safe singleton TantivyBackend instance
    """
    global _backend, _backend_path

@@ -693,7 +846,12 @@ def get_backend() -> TantivyBackend:


 def reset_backend() -> None:
-    """Reset the global backend instance with thread safety."""
+    """
+    Reset the global backend instance with thread safety.
+
+    Forces creation of a new backend instance on the next get_backend() call.
+    Used for test isolation and when switching between different index directories.
+    """
    global _backend, _backend_path

    with _backend_lock:
@@ -273,9 +273,24 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:

 def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
    """
-    Preprocessing stage 1: rewrite Whoosh compact dates, relative ranges,
-    and natural date keywords (field:today etc.) to ISO 8601.
-    Bare keywords without a field: prefix pass through unchanged.
+    Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
+
+    Performs the first stage of query preprocessing, converting various date
+    formats and keywords to ISO 8601 datetime ranges that Tantivy can parse:
+    - Compact 14-digit dates (YYYYMMDDHHmmss)
+    - Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h])
+    - 8-digit dates with field awareness (created:20240115)
+    - Natural keywords (field:today, field:last_week, etc.)
+
+    Args:
+        query: Raw user query string
+        tz: Timezone for converting local date boundaries to UTC
+
+    Returns:
+        Query with date syntax rewritten to ISO 8601 ranges
+
+    Note:
+        Bare keywords without field prefixes pass through unchanged.
    """
    query = _rewrite_compact_date(query)
    query = _rewrite_whoosh_relative_range(query)
@@ -293,8 +308,18 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:

 def normalize_query(query: str) -> str:
    """
-    Join comma-separated field values with AND, collapse whitespace.
-    tag:foo,bar → tag:foo AND tag:bar
+    Normalize query syntax for better search behavior.
+
+    Expands comma-separated field values to explicit AND clauses and
+    collapses excessive whitespace for cleaner parsing:
+    - tag:foo,bar → tag:foo AND tag:bar
+    - multiple spaces → single spaces
+
+    Args:
+        query: Query string after date rewriting
+
+    Returns:
+        Normalized query string ready for Tantivy parsing
    """

    def _expand(m: re.Match[str]) -> str:
@@ -314,24 +339,27 @@ def build_permission_filter(
    user: AbstractBaseUser,
 ) -> tantivy.Query:
    """
-    Returns a Query matching documents visible to user:
-    - no owner (public)      → owner_id field absent (NULL in Django)
-    - owned by user          → owner_id = user.pk
-    - shared with user       → viewer_id = user.pk
+    Build a query filter for user document permissions.

-    Uses disjunction_max_query — boolean Should-only would match all docs.
+    Creates a query that matches only documents visible to the specified user
+    according to paperless-ngx permission rules:
+    - Public documents (no owner) are visible to all users
+    - Private documents are visible to their owner
+    - Documents explicitly shared with the user are visible

-    NOTE: all integer queries use range_query, not term_query, to avoid the
-    unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64
-    before u64; confirmed empirically — term_query returns 0 for u64 fields).
-    Same root cause as issue #47 (from_dict) but the term_query path unfixed.
-    See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
-         https://github.com/quickwit-oss/tantivy-py/issues/47
+    Args:
+        schema: Tantivy schema for field validation
+        user: User to check permissions for

-    NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because
-    exists_query is not available in 0.25.1. It is present in master and can
-    simplify this to MustNot(exists_query("owner_id")) once released.
-    See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
+    Returns:
+        Tantivy query that filters results to visible documents
+
+    Implementation Notes:
+        - Uses range_query instead of term_query to work around unsigned integer
+          type detection bug in tantivy-py 0.25
+        - Uses boolean_query for "no owner" check since exists_query is not
+          available in tantivy-py 0.25.1 (available in master)
+        - Uses disjunction_max_query to combine permission clauses with OR logic
    """
    owner_any = tantivy.Query.range_query(
        schema,
@@ -380,12 +408,28 @@ def parse_user_query(
    raw_query: str,
    tz: tzinfo,
 ) -> tantivy.Query:
-    """Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse.
+    """
+    Parse user query through the complete preprocessing pipeline.

-    When ADVANCED_FUZZY_SEARCH_THRESHOLD is set (any float), a fuzzy query is blended in as a
-    Should clause boosted at 0.1 — keeping fuzzy hits ranked below exact matches. The fuzzy
-    query uses edit-distance=1, prefix=True, transposition_cost_one=True on all search fields.
-    The threshold float is a post-search minimum-score filter applied in the backend layer, not here.
+    Transforms the raw user query through multiple stages:
+    1. Date keyword rewriting (today → ISO 8601 ranges)
+    2. Query normalization (comma expansion, whitespace cleanup)
+    3. Tantivy parsing with field boosts
+    4. Optional fuzzy query blending (if ADVANCED_FUZZY_SEARCH_THRESHOLD set)
+
+    Args:
+        index: Tantivy index with registered tokenizers
+        raw_query: Original user query string
+        tz: Timezone for date boundary calculations
+
+    Returns:
+        Parsed Tantivy query ready for execution
+
+    Note:
+        When ADVANCED_FUZZY_SEARCH_THRESHOLD is configured, adds a low-priority
+        fuzzy query as a Should clause (0.1 boost) to catch approximate matches
+        while keeping exact matches ranked higher. The threshold value is applied
+        as a post-search score filter, not during query construction.
    """

    query_str = rewrite_natural_date_keywords(raw_query, tz)
@@ -16,7 +16,16 @@ SCHEMA_VERSION = 1


 def build_schema() -> tantivy.Schema:
-    """Build the Tantivy schema for the paperless document index."""
+    """
+    Build the Tantivy schema for the paperless document index.
+
+    Creates a comprehensive schema supporting full-text search, filtering,
+    sorting, and autocomplete functionality. Includes fields for document
+    content, metadata, permissions, custom fields, and notes.
+
+    Returns:
+        Configured Tantivy schema ready for index creation
+    """
    sb = tantivy.SchemaBuilder()

    sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
@@ -79,7 +88,19 @@ def build_schema() -> tantivy.Schema:


 def needs_rebuild(index_dir: Path) -> bool:
-    """Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
+    """
+    Check if the search index needs rebuilding.
+
+    Compares the current schema version and search language configuration
+    against sentinel files to determine if the index is compatible with
+    the current paperless-ngx version and settings.
+
+    Args:
+        index_dir: Path to the search index directory
+
+    Returns:
+        True if the index needs rebuilding, False if it's up to date
+    """
    version_file = index_dir / ".schema_version"
    if not version_file.exists():
        return True
@@ -102,7 +123,15 @@ def needs_rebuild(index_dir: Path) -> bool:


 def wipe_index(index_dir: Path) -> None:
-    """Delete all children in the index directory to prepare for rebuild."""
+    """
+    Delete all contents of the index directory to prepare for rebuild.
+
+    Recursively removes all files and subdirectories within the index
+    directory while preserving the directory itself.
+
+    Args:
+        index_dir: Path to the search index directory to clear
+    """
    for child in list(index_dir.iterdir()):
        if child.is_dir():
            shutil.rmtree(child)
@@ -118,9 +147,17 @@ def _write_sentinels(index_dir: Path) -> None:

 def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
    """
-    Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR),
-    creating or rebuilding as needed.
-    Caller must register custom tokenizers after receiving the Index.
+    Open the Tantivy index, creating or rebuilding as needed.
+
+    Checks if the index needs rebuilding due to schema version or language
+    changes. If rebuilding is needed, wipes the directory and creates a fresh
+    index with the current schema and configuration.
+
+    Args:
+        index_dir: Path to index directory (defaults to settings.INDEX_DIR)
+
+    Returns:
+        Opened Tantivy index (caller must register custom tokenizers)
    """
    if index_dir is None:
        index_dir = settings.INDEX_DIR
@@ -51,13 +51,21 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)

 def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
    """
-    Register all custom tokenizers on *index*. Must be called on every Index
-    instance — tantivy requires re-registration at each open.
+    Register all custom tokenizers required by the paperless schema.

-    simple_analyzer is also registered as a fast-field tokenizer because the
-    sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
-    Tantivy writes default values for fast columns on every commit, even for
-    documents that omit those fields, so the fast-field tokenizer must exist.
+    Must be called on every Index instance since Tantivy requires tokenizer
+    re-registration after each index open/creation. Registers tokenizers for
+    full-text search, sorting, CJK language support, and fast-field indexing.
+
+    Args:
+        index: Tantivy index instance to register tokenizers on
+        language: ISO 639-1 language code for stemming (None to disable)
+
+    Note:
+        simple_analyzer is registered as both a text and fast-field tokenizer
+        since sort shadow fields (title_sort, correspondent_sort, type_sort)
+        use fast=True and Tantivy requires fast-field tokenizers to exist
+        even for documents that omit those fields.
    """
    index.register_tokenizer("paperless_text", _paperless_text(language))
    index.register_tokenizer("simple_analyzer", _simple_analyzer())
@@ -16,7 +16,7 @@ class TestWriteBatch:
    """Test WriteBatch context manager functionality."""

    def test_rolls_back_on_exception(self, backend: TantivyBackend):
-        """Data integrity: a mid-batch exception must not corrupt the index."""
+        """Batch operations must roll back on exception to preserve index integrity."""
        doc = Document.objects.create(
            title="Rollback Target",
            content="should survive",
@@ -47,7 +47,7 @@ class TestSearch:
    """Test search functionality."""

    def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
-        """UI score bar depends on the top hit being 1.0."""
+        """Search scores must be normalized so top hit has score 1.0 for UI consistency."""
        for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
            doc = Document.objects.create(
                title=title,
@@ -68,7 +68,7 @@ class TestSearch:
        assert all(0.0 <= h["score"] <= 1.0 for h in r.hits)

    def test_owner_filter(self, backend: TantivyBackend):
-        """Owner can find their document; other user cannot."""
+        """Document owners can search their private documents; other users cannot access them."""
        owner = User.objects.create_user("owner")
        other = User.objects.create_user("other")
        doc = Document.objects.create(
@@ -108,7 +108,7 @@ class TestRebuild:
    """Test index rebuilding functionality."""

    def test_with_iter_wrapper_called(self, backend: TantivyBackend):
-        """rebuild() must pass documents through iter_wrapper."""
+        """Index rebuild must pass documents through iter_wrapper for progress tracking."""
        seen = []

        def wrapper(docs):
@@ -125,7 +125,7 @@ class TestAutocomplete:
    """Test autocomplete functionality."""

    def test_basic_functionality(self, backend: TantivyBackend):
-        """Autocomplete should find word prefixes."""
+        """Autocomplete must return words matching the given prefix."""
        doc = Document.objects.create(
            title="Invoice from Microsoft Corporation",
            content="payment details",
@@ -138,7 +138,7 @@ class TestAutocomplete:
        assert "microsoft" in results

    def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
-        """Most-used prefix match should rank first."""
+        """Autocomplete results must be ordered by document frequency to prioritize common terms."""
        # "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should
        # return "payment" before "payslip".
        for i, (title, checksum) in enumerate(
@@ -166,7 +166,7 @@ class TestMoreLikeThis:
    """Test more like this functionality."""

    def test_excludes_original(self, backend: TantivyBackend):
-        """More like this should not return the original document."""
+        """More like this queries must exclude the reference document from results."""
        doc1 = Document.objects.create(
            title="Important document",
            content="financial information",
@@ -197,9 +197,11 @@ class TestSingleton:
        reset_backend()

    def test_returns_same_instance_on_repeated_calls(self, index_dir):
+        """Singleton pattern: repeated calls to get_backend() must return the same instance."""
        assert get_backend() is get_backend()

    def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
+        """Backend singleton must reinitialize when INDEX_DIR setting changes for test isolation."""
        settings.INDEX_DIR = tmp_path / "a"
        (tmp_path / "a").mkdir()
        b1 = get_backend()
@@ -212,6 +214,7 @@ class TestSingleton:
        assert b2._path == tmp_path / "b"

    def test_reset_forces_new_instance(self, index_dir):
+        """reset_backend() must force creation of a new backend instance on next get_backend() call."""
        b1 = get_backend()
        reset_backend()
        b2 = get_backend()
@@ -222,7 +225,7 @@ class TestFieldHandling:
    """Test handling of various document fields."""

    def test_none_values_handled_correctly(self, backend: TantivyBackend):
-        """Test that None values for original_filename and page_count are handled properly."""
+        """Document fields with None values must not cause indexing errors."""
        doc = Document.objects.create(
            title="Test Doc",
            content="test content",
@@ -245,7 +248,7 @@ class TestFieldHandling:
        assert results.total == 1

    def test_custom_fields_include_name_and_value(self, backend: TantivyBackend):
-        """Custom field indexing should include both name and value."""
+        """Custom fields must be indexed with both field name and value for structured queries."""
        # Create a custom field
        field = CustomField.objects.create(
            name="Invoice Number",
@@ -277,7 +280,7 @@ class TestFieldHandling:
        assert results.total == 1

    def test_notes_include_user_information(self, backend: TantivyBackend):
-        """Notes should include user information when available."""
+        """Notes must be indexed with user information when available for structured queries."""
        user = User.objects.create_user("notewriter")
        doc = Document.objects.create(
            title="Doc with notes",
@@ -45,7 +45,7 @@ class TestTokenizers:
        self,
        content_index: tantivy.Index,
    ) -> None:
-        """paperless_text normalises diacritics so café is findable as cafe."""
+        """ASCII folding allows searching accented text with plain ASCII queries."""
        writer = content_index.writer()
        doc = tantivy.Document()
        doc.add_text("content", "café résumé")
@@ -56,7 +56,7 @@ class TestTokenizers:
        assert content_index.searcher().search(q, limit=5).count == 1

    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
-        """bigram_analyzer makes CJK substrings searchable without whitespace."""
+        """Bigram tokenizer enables substring search in CJK languages without whitespace delimiters."""
        writer = bigram_index.writer()
        doc = tantivy.Document()
        doc.add_text("bigram_content", "東京都")
@@ -67,6 +67,7 @@ class TestTokenizers:
        assert bigram_index.searcher().search(q, limit=5).count == 1

    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
+        """Unsupported language codes should log a warning and disable stemming gracefully."""
        sb = tantivy.SchemaBuilder()
        sb.add_text_field("content", stored=True, tokenizer_name="paperless_text")
        schema = sb.build()
@@ -106,6 +106,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@pytest.mark.django_db
 class TestMakeIndex:
    def test_reindex(self, mocker: MockerFixture) -> None:
+        """Reindex command must call the backend rebuild method to recreate the index."""
        mock_get_backend = mocker.patch(
            "documents.management.commands.document_index.get_backend",
        )
@@ -113,12 +114,14 @@ class TestMakeIndex:
        mock_get_backend.return_value.rebuild.assert_called_once()

    def test_optimize(self) -> None:
+        """Optimize command must execute without error (Tantivy handles optimization automatically)."""
        call_command("document_index", "optimize", skip_checks=True)

    def test_reindex_if_needed_skips_when_up_to_date(
        self,
        mocker: MockerFixture,
    ) -> None:
+        """Conditional reindex must skip rebuild when schema version and language match."""
        mocker.patch(
            "documents.management.commands.document_index.needs_rebuild",
            return_value=False,
@@ -133,6 +136,7 @@ class TestMakeIndex:
        self,
        mocker: MockerFixture,
    ) -> None:
+        """Conditional reindex must proceed with rebuild when schema version or language changed."""
        mocker.patch(
            "documents.management.commands.document_index.needs_rebuild",
            return_value=True,
@@ -23,8 +23,10 @@ from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin


-class TestIndexOptimize(TestCase):
+@pytest.mark.django_db
+class TestIndexOptimize:
    def test_index_optimize(self) -> None:
+        """Index optimization task must execute without error (Tantivy handles optimization automatically)."""
        tasks.index_optimize()