From e7f68c2082cdaaba77b7ad16ab6086a877fc3b8c Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Mon, 30 Mar 2026 15:54:18 -0700
Subject: [PATCH] docs: Enhance docstrings and test quality for Tantivy search
 backend

- Add comprehensive docstrings to all public methods and classes in the search package
  - Clarify purpose, parameters, return values, and implementation notes
  - Document thread safety, error handling, and usage patterns
  - Explain Tantivy-specific workarounds and design decisions

- Improve test quality and pytest compliance
  - Add descriptive docstrings explaining what each test verifies
  - Convert TestIndexOptimize to pytest style with @pytest.mark.django_db
  - Ensure all test docstrings focus on behavior verification rather than implementation

- Maintain existing functionality while improving code documentation
  - No changes to production logic or test coverage
  - All tests continue to pass with enhanced clarity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../management/commands/document_index.py     |   7 +
 src/documents/search/_backend.py              | 214 +++++++++++++++---
 src/documents/search/_query.py                |  94 ++++++--
 src/documents/search/_schema.py               |  49 +++-
 src/documents/search/_tokenizer.py            |  20 +-
 src/documents/tests/search/test_backend.py    |  23 +-
 src/documents/tests/search/test_tokenizer.py  |   5 +-
 src/documents/tests/test_management.py        |   4 +
 src/documents/tests/test_tasks.py             |   4 +-
 9 files changed, 342 insertions(+), 78 deletions(-)

diff --git a/src/documents/management/commands/document_index.py b/src/documents/management/commands/document_index.py
index 598719024..c4f72dd3a 100644
--- a/src/documents/management/commands/document_index.py
+++ b/src/documents/management/commands/document_index.py
@@ -14,6 +14,13 @@ logger = logging.getLogger("paperless.management.document_index")
 
 
 class Command(PaperlessCommand):
+    """
+    Django management command for search index operations.
+
+    Provides subcommands for reindexing documents and optimizing the search index.
+    Supports conditional reindexing based on schema version and language changes.
+    """
+
     help = "Manages the document index."
 
     supports_progress_bar = True
diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py
index fc1c57262..6bee65f1e 100644
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -46,12 +46,17 @@ T = TypeVar("T")
 
 
 def _identity(iterable: Iterable[T]) -> Iterable[T]:
-    """Default iter_wrapper that passes through unchanged."""
+    """Default iter_wrapper that passes documents through unchanged for indexing."""
     return iterable
 
 
 def _ascii_fold(s: str) -> str:
-    """Normalize unicode to ASCII equivalent characters."""
+    """
+    Normalize unicode to ASCII equivalent characters for search consistency.
+
+    Strips diacritics via NFD decomposition (e.g., "café" becomes "cafe") so searches
+    need not match accents; characters with no ASCII form (e.g., "ß") are dropped.
+    """
     return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()
 
 
@@ -91,17 +96,33 @@ class SearchHit(TypedDict):
 
 @dataclass(frozen=True, slots=True)
 class SearchResults:
+    """
+    Container for search results with pagination metadata.
+
+    Attributes:
+        hits: List of search results with scores and highlights
+        total: Total matching documents across all pages (for pagination)
+        query: Preprocessed query string after date/syntax rewriting
+    """
+
     hits: list[SearchHit]
     total: int  # total matching documents (for pagination)
     query: str  # preprocessed query string
 
 
 class TantivyRelevanceList:
-    """DRF-compatible list wrapper for Tantivy search hits.
+    """
+    DRF-compatible list wrapper for Tantivy search hits.
 
-    __len__ returns the total hit count (for pagination); __getitem__ slices
-    the hit list.  Stores ALL post-filter hits so that get_all_result_ids()
-    can return every matching doc ID without a second query.
+    Provides paginated access to search results while storing all hits in memory
+    for efficient ID retrieval. Used by Django REST framework for pagination.
+
+    Methods:
+        __len__: Returns total hit count for pagination calculations
+        __getitem__: Slices the hit list for page-specific results
+
+    Note: Stores ALL post-filter hits so get_all_result_ids() can return
+    every matching document ID without requiring a second search query.
     """
 
     def __init__(self, hits: list[SearchHit]) -> None:
@@ -115,11 +136,22 @@ class TantivyRelevanceList:
 
 
 class SearchIndexLockError(Exception):
-    pass
+    """Raised when the search index file lock cannot be acquired within the timeout."""
 
 
 class WriteBatch:
-    """Context manager for bulk index operations with file locking."""
+    """
+    Context manager for bulk index operations with file locking.
+
+    Provides transactional batch updates to the search index with proper
+    concurrency control via file locking. All operations within the batch
+    are committed atomically or rolled back on exception.
+
+    Usage:
+        with backend.batch_update() as batch:
+            batch.add_or_update(document)
+            batch.remove(doc_id)
+    """
 
     def __init__(self, backend: TantivyBackend, lock_timeout: float):
         self._backend = backend
@@ -160,18 +192,29 @@ class WriteBatch:
         document: Document,
         effective_content: str | None = None,
     ) -> None:
-        """Add or update a document in the batch.
+        """
+        Add or update a document in the batch.
 
-        Tantivy has no native upsert — we delete by id then re-add so
-        stale copies (e.g. after a permission change) don't linger.
-        ``effective_content`` overrides ``document.content`` for indexing.
+        Implements upsert behavior by deleting any existing document with the same ID
+        and adding the new version. This ensures stale document data (e.g., after
+        permission changes) doesn't persist in the index.
+
+        Args:
+            document: Django Document instance to index
+            effective_content: Override document.content for indexing (used when
+                re-indexing with newer OCR text from document versions)
         """
         self.remove(document.pk)
         doc = self._backend._build_tantivy_doc(document, effective_content)
         self._writer.add_document(doc)
 
     def remove(self, doc_id: int) -> None:
-        """Remove a document from the batch."""
+        """
+        Remove a document from the batch by its primary key.
+
+        Uses range query instead of term query to work around unsigned integer
+        type detection bug in tantivy-py 0.25.
+        """
         # Use range query to work around u64 deletion bug
         self._writer.delete_documents_by_query(
             tantivy.Query.range_query(
@@ -185,7 +228,17 @@ class WriteBatch:
 
 
 class TantivyBackend:
-    """Tantivy search backend with explicit lifecycle management."""
+    """
+    Tantivy search backend with explicit lifecycle management.
+
+    Provides full-text search capabilities using the Tantivy search engine.
+    Supports in-memory indexes (for testing) and persistent on-disk indexes
+    (for production use). Handles document indexing, search queries, autocompletion,
+    and "more like this" functionality.
+
+    The backend manages its own connection lifecycle and can be reset when
+    the underlying index directory changes (e.g., during test isolation).
+    """
 
     def __init__(self, path: Path | None = None):
         # path=None → in-memory index (for tests)
@@ -195,7 +248,13 @@ class TantivyBackend:
         self._schema = None
 
     def open(self) -> None:
-        """Open or rebuild the index. Idempotent."""
+        """
+        Open or rebuild the index as needed.
+
+        For disk-based indexes, checks if rebuilding is needed due to schema
+        version or language changes. Registers custom tokenizers after opening.
+        Safe to call multiple times - subsequent calls are no-ops.
+        """
         if self._index is not None:
             return
         if self._path is not None:
@@ -206,7 +265,11 @@ class TantivyBackend:
         self._schema = self._index.schema
 
     def close(self) -> None:
-        """Close the index. Idempotent."""
+        """
+        Close the index and release resources.
+
+        Safe to call multiple times - subsequent calls are no-ops.
+        """
         self._index = None
         self._schema = None
 
@@ -339,13 +402,30 @@ class TantivyBackend:
         document: Document,
         effective_content: str | None = None,
     ) -> None:
-        """Add or update a single document with file locking."""
+        """
+        Add or update a single document with file locking.
+
+        Convenience method for single-document updates. For bulk operations,
+        use batch_update() context manager for better performance.
+
+        Args:
+            document: Django Document instance to index
+            effective_content: Override document.content for indexing
+        """
         self._ensure_open()
         with self.batch_update(lock_timeout=5.0) as batch:
             batch.add_or_update(document, effective_content)
 
     def remove(self, doc_id: int) -> None:
-        """Remove a single document with file locking."""
+        """
+        Remove a single document from the index with file locking.
+
+        Convenience method for single-document removal. For bulk operations,
+        use batch_update() context manager for better performance.
+
+        Args:
+            doc_id: Primary key of the document to remove
+        """
         self._ensure_open()
         with self.batch_update(lock_timeout=5.0) as batch:
             batch.remove(doc_id)
@@ -360,7 +440,24 @@ class TantivyBackend:
         *,
         sort_reverse: bool,
     ) -> SearchResults:
-        """Search the index."""
+        """
+        Execute a search query against the document index.
+
+        Processes the user query through date rewriting, normalization, and
+        permission filtering before executing against Tantivy. Supports both
+        relevance-based and field-based sorting.
+
+        Args:
+            query: User's search query (supports natural date keywords, field filters)
+            user: User for permission filtering (None for superuser/no filtering)
+            page: Page number (1-indexed) for pagination
+            page_size: Number of results per page
+            sort_field: Field to sort by (None for relevance ranking)
+            sort_reverse: Whether to reverse the sort order
+
+        Returns:
+            SearchResults with hits, total count, and processed query
+        """
         self._ensure_open()
         tz = get_current_timezone()
         user_query = parse_user_query(self._index, query, tz)
@@ -491,7 +588,21 @@ class TantivyBackend:
         limit: int,
         user: AbstractBaseUser | None = None,
     ) -> list[str]:
-        """Get autocomplete suggestions, optionally filtered by user visibility."""
+        """
+        Get autocomplete suggestions for search queries.
+
+        Returns words that start with the given term prefix, ranked by document
+        frequency (how many documents contain each word). Optionally filters
+        results to only words from documents visible to the specified user.
+
+        Args:
+            term: Prefix to match against autocomplete words
+            limit: Maximum number of suggestions to return
+            user: User for permission filtering (None for no filtering)
+
+        Returns:
+            List of word suggestions ordered by frequency, then alphabetically
+        """
         self._ensure_open()
         normalized_term = _ascii_fold(term.lower())
 
@@ -533,7 +644,21 @@ class TantivyBackend:
         page: int,
         page_size: int,
     ) -> SearchResults:
-        """Find documents similar to the given document."""
+        """
+        Find documents similar to the given document using content analysis.
+
+        Uses Tantivy's "more like this" query to find documents with similar
+        content patterns. The original document is excluded from results.
+
+        Args:
+            doc_id: Primary key of the reference document
+            user: User for permission filtering (None for no filtering)
+            page: Page number (1-indexed) for pagination
+            page_size: Number of results per page
+
+        Returns:
+            SearchResults with similar documents (excluding the original)
+        """
         self._ensure_open()
         searcher = self._index.searcher()
 
@@ -621,12 +746,36 @@ class TantivyBackend:
         )
 
     def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
-        """Get a batch context manager for bulk operations."""
+        """
+        Get a batch context manager for bulk index operations.
+
+        Use this for efficient bulk document updates/deletions. All operations
+        within the batch are committed atomically at the end of the context.
+
+        Args:
+            lock_timeout: Seconds to wait for file lock acquisition
+
+        Returns:
+            WriteBatch context manager
+
+        Raises:
+            SearchIndexLockError: If lock cannot be acquired within timeout
+        """
         self._ensure_open()
         return WriteBatch(self, lock_timeout)
 
     def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
-        """Rebuild the entire search index."""
+        """
+        Rebuild the entire search index from scratch.
+
+        Wipes the existing index and re-indexes all provided documents.
+        On failure, restores the previous index state to keep the backend usable.
+
+        Args:
+            documents: QuerySet of Document instances to index
+            iter_wrapper: Optional wrapper function for progress tracking
+                (e.g., progress bar). Should yield each document unchanged.
+        """
         # Create new index (on-disk or in-memory)
         if self._path is not None:
             wipe_index(self._path)
@@ -662,11 +811,15 @@ _backend_lock = threading.RLock()
 
 
 def get_backend() -> TantivyBackend:
-    """Get the global backend instance with thread safety.
+    """
+    Get the global backend instance with thread safety.
 
-    Automatically reinitializes when settings.INDEX_DIR changes — this fixes
-    the xdist/override_settings isolation issue where each test may set a
-    different INDEX_DIR but would otherwise share a stale singleton.
+    Returns a singleton TantivyBackend instance, automatically reinitializing
+    when settings.INDEX_DIR changes. This ensures proper test isolation when
+    using pytest-xdist or @override_settings that change the index directory.
+
+    Returns:
+        The shared TantivyBackend singleton for the current settings.INDEX_DIR
     """
     global _backend, _backend_path
 
@@ -693,7 +846,12 @@ def get_backend() -> TantivyBackend:
 
 
 def reset_backend() -> None:
-    """Reset the global backend instance with thread safety."""
+    """
+    Reset the global backend instance with thread safety.
+
+    Forces creation of a new backend instance on the next get_backend() call.
+    Used for test isolation and when switching between different index directories.
+    """
     global _backend, _backend_path
 
     with _backend_lock:
diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py
index b58ebe430..4dfd025a4 100644
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -273,9 +273,24 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
 
 def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
     """
-    Preprocessing stage 1: rewrite Whoosh compact dates, relative ranges,
-    and natural date keywords (field:today etc.) to ISO 8601.
-    Bare keywords without a field: prefix pass through unchanged.
+    Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
+
+    Performs the first stage of query preprocessing, converting various date
+    formats and keywords to ISO 8601 datetime ranges that Tantivy can parse:
+    - Compact 14-digit dates (YYYYMMDDHHmmss)
+    - Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h])
+    - 8-digit dates with field awareness (created:20240115)
+    - Natural keywords (field:today, field:last_week, etc.)
+
+    Args:
+        query: Raw user query string
+        tz: Timezone for converting local date boundaries to UTC
+
+    Returns:
+        Query with date syntax rewritten to ISO 8601 ranges
+
+    Note:
+        Bare keywords without field prefixes pass through unchanged.
     """
     query = _rewrite_compact_date(query)
     query = _rewrite_whoosh_relative_range(query)
@@ -293,8 +308,18 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
 
 def normalize_query(query: str) -> str:
     """
-    Join comma-separated field values with AND, collapse whitespace.
-    tag:foo,bar → tag:foo AND tag:bar
+    Normalize query syntax for better search behavior.
+
+    Expands comma-separated field values to explicit AND clauses and
+    collapses excessive whitespace for cleaner parsing:
+    - tag:foo,bar → tag:foo AND tag:bar
+    - multiple spaces → single spaces
+
+    Args:
+        query: Query string after date rewriting
+
+    Returns:
+        Normalized query string ready for Tantivy parsing
     """
 
     def _expand(m: re.Match[str]) -> str:
@@ -314,24 +339,27 @@ def build_permission_filter(
     user: AbstractBaseUser,
 ) -> tantivy.Query:
     """
-    Returns a Query matching documents visible to user:
-    - no owner (public)      → owner_id field absent (NULL in Django)
-    - owned by user          → owner_id = user.pk
-    - shared with user       → viewer_id = user.pk
+    Build a query filter for user document permissions.
 
-    Uses disjunction_max_query — boolean Should-only would match all docs.
+    Creates a query that matches only documents visible to the specified user
+    according to paperless-ngx permission rules:
+    - Public documents (no owner) are visible to all users
+    - Private documents are visible to their owner
+    - Documents explicitly shared with the user are visible
 
-    NOTE: all integer queries use range_query, not term_query, to avoid the
-    unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64
-    before u64; confirmed empirically — term_query returns 0 for u64 fields).
-    Same root cause as issue #47 (from_dict) but the term_query path unfixed.
-    See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
-         https://github.com/quickwit-oss/tantivy-py/issues/47
+    Args:
+        schema: Tantivy schema passed to the range queries built here
+        user: User to check permissions for
 
-    NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because
-    exists_query is not available in 0.25.1. It is present in master and can
-    simplify this to MustNot(exists_query("owner_id")) once released.
-    See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
+    Returns:
+        Tantivy query that filters results to visible documents
+
+    Implementation Notes:
+        - Uses range_query instead of term_query to work around the unsigned integer
+          type-detection bug in tantivy-py 0.25 (term_query returns 0 hits on u64 fields)
+        - Uses boolean_query for "no owner" check since exists_query is not
+          available in tantivy-py 0.25.1 (available in master)
+        - Uses disjunction_max_query for OR; Should-only boolean_query would match all docs
     """
     owner_any = tantivy.Query.range_query(
         schema,
@@ -380,12 +408,28 @@ def parse_user_query(
     raw_query: str,
     tz: tzinfo,
 ) -> tantivy.Query:
-    """Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse.
+    """
+    Parse user query through the complete preprocessing pipeline.
 
-    When ADVANCED_FUZZY_SEARCH_THRESHOLD is set (any float), a fuzzy query is blended in as a
-    Should clause boosted at 0.1 — keeping fuzzy hits ranked below exact matches. The fuzzy
-    query uses edit-distance=1, prefix=True, transposition_cost_one=True on all search fields.
-    The threshold float is a post-search minimum-score filter applied in the backend layer, not here.
+    Transforms the raw user query through multiple stages:
+    1. Date keyword rewriting (today → ISO 8601 ranges)
+    2. Query normalization (comma expansion, whitespace cleanup)
+    3. Tantivy parsing with field boosts
+    4. Optional fuzzy query blending (if ADVANCED_FUZZY_SEARCH_THRESHOLD set)
+
+    Args:
+        index: Tantivy index with registered tokenizers
+        raw_query: Original user query string
+        tz: Timezone for date boundary calculations
+
+    Returns:
+        Parsed Tantivy query ready for execution
+
+    Note:
+        When ADVANCED_FUZZY_SEARCH_THRESHOLD is set, a fuzzy query (edit distance 1,
+        prefix=True, transposition_cost_one=True) is blended in as a Should clause
+        boosted at 0.1 so fuzzy hits rank below exact matches. The threshold itself is
+        a post-search minimum-score filter applied in the backend layer, not here.
     """
 
     query_str = rewrite_natural_date_keywords(raw_query, tz)
diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py
index ef7e4a921..18575cb4c 100644
--- a/src/documents/search/_schema.py
+++ b/src/documents/search/_schema.py
@@ -16,7 +16,16 @@ SCHEMA_VERSION = 1
 
 
 def build_schema() -> tantivy.Schema:
-    """Build the Tantivy schema for the paperless document index."""
+    """
+    Build the Tantivy schema for the paperless document index.
+
+    Creates a comprehensive schema supporting full-text search, filtering,
+    sorting, and autocomplete functionality. Includes fields for document
+    content, metadata, permissions, custom fields, and notes.
+
+    Returns:
+        Configured Tantivy schema ready for index creation
+    """
     sb = tantivy.SchemaBuilder()
 
     sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
@@ -79,7 +88,19 @@ def build_schema() -> tantivy.Schema:
 
 
 def needs_rebuild(index_dir: Path) -> bool:
-    """Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
+    """
+    Check if the search index needs rebuilding.
+
+    Compares the current schema version and search language configuration
+    against sentinel files to determine if the index is compatible with
+    the current paperless-ngx version and settings.
+
+    Args:
+        index_dir: Path to the search index directory
+
+    Returns:
+        True if the index needs rebuilding, False if it's up to date
+    """
     version_file = index_dir / ".schema_version"
     if not version_file.exists():
         return True
@@ -102,7 +123,15 @@ def needs_rebuild(index_dir: Path) -> bool:
 
 
 def wipe_index(index_dir: Path) -> None:
-    """Delete all children in the index directory to prepare for rebuild."""
+    """
+    Delete all contents of the index directory to prepare for rebuild.
+
+    Recursively removes all files and subdirectories within the index
+    directory while preserving the directory itself.
+
+    Args:
+        index_dir: Path to the search index directory to clear
+    """
     for child in list(index_dir.iterdir()):
         if child.is_dir():
             shutil.rmtree(child)
@@ -118,9 +147,17 @@ def _write_sentinels(index_dir: Path) -> None:
 
 def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
     """
-    Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR),
-    creating or rebuilding as needed.
-    Caller must register custom tokenizers after receiving the Index.
+    Open the Tantivy index, creating or rebuilding as needed.
+
+    Checks if the index needs rebuilding due to schema version or language
+    changes. If rebuilding is needed, wipes the directory and creates a fresh
+    index with the current schema and configuration.
+
+    Args:
+        index_dir: Path to index directory (defaults to settings.INDEX_DIR)
+
+    Returns:
+        Opened Tantivy index (caller must register custom tokenizers)
     """
     if index_dir is None:
         index_dir = settings.INDEX_DIR
diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py
index 628b9dcdb..e597a879e 100644
--- a/src/documents/search/_tokenizer.py
+++ b/src/documents/search/_tokenizer.py
@@ -51,13 +51,21 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
 
 def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
     """
-    Register all custom tokenizers on *index*. Must be called on every Index
-    instance — tantivy requires re-registration at each open.
+    Register all custom tokenizers required by the paperless schema.
 
-    simple_analyzer is also registered as a fast-field tokenizer because the
-    sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
-    Tantivy writes default values for fast columns on every commit, even for
-    documents that omit those fields, so the fast-field tokenizer must exist.
+    Must be called on every Index instance since Tantivy requires tokenizer
+    re-registration after each index open/creation. Registers tokenizers for
+    full-text search, sorting, CJK language support, and fast-field indexing.
+
+    Args:
+        index: Tantivy index instance to register tokenizers on
+        language: ISO 639-1 language code for stemming (None to disable)
+
+    Note:
+        simple_analyzer is also registered as a fast-field tokenizer because the
+        sort shadow fields (title_sort, correspondent_sort, type_sort) use
+        fast=True: Tantivy writes default fast-column values on every commit,
+        even for documents that omit those fields, so the tokenizer must exist.
     """
     index.register_tokenizer("paperless_text", _paperless_text(language))
     index.register_tokenizer("simple_analyzer", _simple_analyzer())
diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py
index 23adfda85..f540d6d51 100644
--- a/src/documents/tests/search/test_backend.py
+++ b/src/documents/tests/search/test_backend.py
@@ -16,7 +16,7 @@ class TestWriteBatch:
     """Test WriteBatch context manager functionality."""
 
     def test_rolls_back_on_exception(self, backend: TantivyBackend):
-        """Data integrity: a mid-batch exception must not corrupt the index."""
+        """Batch operations must roll back on exception to preserve index integrity."""
         doc = Document.objects.create(
             title="Rollback Target",
             content="should survive",
@@ -47,7 +47,7 @@ class TestSearch:
     """Test search functionality."""
 
     def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
-        """UI score bar depends on the top hit being 1.0."""
+        """Search scores must be normalized so top hit has score 1.0 for UI consistency."""
         for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
             doc = Document.objects.create(
                 title=title,
@@ -68,7 +68,7 @@ class TestSearch:
         assert all(0.0 <= h["score"] <= 1.0 for h in r.hits)
 
     def test_owner_filter(self, backend: TantivyBackend):
-        """Owner can find their document; other user cannot."""
+        """Document owners can search their private documents; other users cannot access them."""
         owner = User.objects.create_user("owner")
         other = User.objects.create_user("other")
         doc = Document.objects.create(
@@ -108,7 +108,7 @@ class TestRebuild:
     """Test index rebuilding functionality."""
 
     def test_with_iter_wrapper_called(self, backend: TantivyBackend):
-        """rebuild() must pass documents through iter_wrapper."""
+        """Index rebuild must pass documents through iter_wrapper for progress tracking."""
         seen = []
 
         def wrapper(docs):
@@ -125,7 +125,7 @@ class TestAutocomplete:
     """Test autocomplete functionality."""
 
     def test_basic_functionality(self, backend: TantivyBackend):
-        """Autocomplete should find word prefixes."""
+        """Autocomplete must return words matching the given prefix."""
         doc = Document.objects.create(
             title="Invoice from Microsoft Corporation",
             content="payment details",
@@ -138,7 +138,7 @@ class TestAutocomplete:
         assert "microsoft" in results
 
     def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
-        """Most-used prefix match should rank first."""
+        """Autocomplete results must be ordered by document frequency to prioritize common terms."""
         # "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should
         # return "payment" before "payslip".
         for i, (title, checksum) in enumerate(
@@ -166,7 +166,7 @@ class TestMoreLikeThis:
     """Test more like this functionality."""
 
     def test_excludes_original(self, backend: TantivyBackend):
-        """More like this should not return the original document."""
+        """More like this queries must exclude the reference document from results."""
         doc1 = Document.objects.create(
             title="Important document",
             content="financial information",
@@ -197,9 +197,11 @@ class TestSingleton:
         reset_backend()
 
     def test_returns_same_instance_on_repeated_calls(self, index_dir):
+        """Singleton pattern: repeated calls to get_backend() must return the same instance."""
         assert get_backend() is get_backend()
 
     def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
+        """Backend singleton must reinitialize when INDEX_DIR setting changes for test isolation."""
         settings.INDEX_DIR = tmp_path / "a"
         (tmp_path / "a").mkdir()
         b1 = get_backend()
@@ -212,6 +214,7 @@ class TestSingleton:
         assert b2._path == tmp_path / "b"
 
     def test_reset_forces_new_instance(self, index_dir):
+        """reset_backend() must force creation of a new backend instance on next get_backend() call."""
         b1 = get_backend()
         reset_backend()
         b2 = get_backend()
@@ -222,7 +225,7 @@ class TestFieldHandling:
     """Test handling of various document fields."""
 
     def test_none_values_handled_correctly(self, backend: TantivyBackend):
-        """Test that None values for original_filename and page_count are handled properly."""
+        """None values for original_filename and page_count must not cause indexing errors."""
         doc = Document.objects.create(
             title="Test Doc",
             content="test content",
@@ -245,7 +248,7 @@ class TestFieldHandling:
         assert results.total == 1
 
     def test_custom_fields_include_name_and_value(self, backend: TantivyBackend):
-        """Custom field indexing should include both name and value."""
+        """Custom fields must be indexed with both field name and value for structured queries."""
         # Create a custom field
         field = CustomField.objects.create(
             name="Invoice Number",
@@ -277,7 +280,7 @@ class TestFieldHandling:
         assert results.total == 1
 
     def test_notes_include_user_information(self, backend: TantivyBackend):
-        """Notes should include user information when available."""
+        """Notes must be indexed with user information when available for structured queries."""
         user = User.objects.create_user("notewriter")
         doc = Document.objects.create(
             title="Doc with notes",
diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py
index 6728784ae..aee52a567 100644
--- a/src/documents/tests/search/test_tokenizer.py
+++ b/src/documents/tests/search/test_tokenizer.py
@@ -45,7 +45,7 @@ class TestTokenizers:
         self,
         content_index: tantivy.Index,
     ) -> None:
-        """paperless_text normalises diacritics so café is findable as cafe."""
+        """ASCII folding allows searching accented text with plain ASCII queries."""
         writer = content_index.writer()
         doc = tantivy.Document()
         doc.add_text("content", "café résumé")
@@ -56,7 +56,7 @@ class TestTokenizers:
         assert content_index.searcher().search(q, limit=5).count == 1
 
     def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
-        """bigram_analyzer makes CJK substrings searchable without whitespace."""
+        """Bigram tokenizer enables substring search in CJK languages without whitespace delimiters."""
         writer = bigram_index.writer()
         doc = tantivy.Document()
         doc.add_text("bigram_content", "東京都")
@@ -67,6 +67,7 @@ class TestTokenizers:
         assert bigram_index.searcher().search(q, limit=5).count == 1
 
     def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
+        """Unsupported language codes should log a warning and disable stemming gracefully."""
         sb = tantivy.SchemaBuilder()
         sb.add_text_field("content", stored=True, tokenizer_name="paperless_text")
         schema = sb.build()
diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py
index f6c8ba904..3e61b74f8 100644
--- a/src/documents/tests/test_management.py
+++ b/src/documents/tests/test_management.py
@@ -106,6 +106,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
 @pytest.mark.django_db
 class TestMakeIndex:
     def test_reindex(self, mocker: MockerFixture) -> None:
+        """Reindex command must call the backend rebuild method to recreate the index."""
         mock_get_backend = mocker.patch(
             "documents.management.commands.document_index.get_backend",
         )
@@ -113,12 +114,14 @@ class TestMakeIndex:
         mock_get_backend.return_value.rebuild.assert_called_once()
 
     def test_optimize(self) -> None:
+        """Optimize command must execute without error (Tantivy handles optimization automatically)."""
         call_command("document_index", "optimize", skip_checks=True)
 
     def test_reindex_if_needed_skips_when_up_to_date(
         self,
         mocker: MockerFixture,
     ) -> None:
+        """Conditional reindex must skip rebuild when schema version and language match."""
         mocker.patch(
             "documents.management.commands.document_index.needs_rebuild",
             return_value=False,
@@ -133,6 +136,7 @@ class TestMakeIndex:
         self,
         mocker: MockerFixture,
     ) -> None:
+        """Conditional reindex must proceed with rebuild when schema version or language changed."""
         mocker.patch(
             "documents.management.commands.document_index.needs_rebuild",
             return_value=True,
diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py
index d73481609..9fb9ddbc6 100644
--- a/src/documents/tests/test_tasks.py
+++ b/src/documents/tests/test_tasks.py
@@ -23,8 +23,10 @@ from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin
 
 
-class TestIndexOptimize(TestCase):
+@pytest.mark.django_db
+class TestIndexOptimize:
     def test_index_optimize(self) -> None:
+        """Index optimization task must execute without error (Tantivy handles optimization automatically)."""
         tasks.index_optimize()