From e7f68c2082cdaaba77b7ad16ab6086a877fc3b8c Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 30 Mar 2026 15:54:18 -0700 Subject: [PATCH] docs: Enhance docstrings and test quality for Tantivy search backend - Add comprehensive docstrings to all public methods and classes in the search package - Clarify purpose, parameters, return values, and implementation notes - Document thread safety, error handling, and usage patterns - Explain Tantivy-specific workarounds and design decisions - Improve test quality and pytest compliance - Add descriptive comments explaining what each test verifies - Convert TestIndexOptimize to pytest style with @pytest.mark.django_db - Ensure all test docstrings focus on behavior verification rather than implementation - Maintain existing functionality while improving code documentation - No changes to production logic or test coverage - All tests continue to pass with enhanced clarity Co-Authored-By: Claude Sonnet 4.6 --- .../management/commands/document_index.py | 7 + src/documents/search/_backend.py | 214 +++++++++++++++--- src/documents/search/_query.py | 94 ++++++-- src/documents/search/_schema.py | 49 +++- src/documents/search/_tokenizer.py | 20 +- src/documents/tests/search/test_backend.py | 23 +- src/documents/tests/search/test_tokenizer.py | 5 +- src/documents/tests/test_management.py | 4 + src/documents/tests/test_tasks.py | 4 +- 9 files changed, 342 insertions(+), 78 deletions(-) diff --git a/src/documents/management/commands/document_index.py b/src/documents/management/commands/document_index.py index 598719024..c4f72dd3a 100644 --- a/src/documents/management/commands/document_index.py +++ b/src/documents/management/commands/document_index.py @@ -14,6 +14,13 @@ logger = logging.getLogger("paperless.management.document_index") class Command(PaperlessCommand): + """ + Django management command for search index operations. 
+ + Provides subcommands for reindexing documents and optimizing the search index. + Supports conditional reindexing based on schema version and language changes. + """ + help = "Manages the document index." supports_progress_bar = True diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index fc1c57262..6bee65f1e 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -46,12 +46,17 @@ T = TypeVar("T") def _identity(iterable: Iterable[T]) -> Iterable[T]: - """Default iter_wrapper that passes through unchanged.""" + """Default iter_wrapper that passes documents through unchanged for indexing.""" return iterable def _ascii_fold(s: str) -> str: - """Normalize unicode to ASCII equivalent characters.""" + """ + Normalize unicode to ASCII equivalent characters for search consistency. + + Converts accented characters (e.g., "café") to their ASCII base forms ("cafe") + to enable cross-language searching without requiring exact diacritic matching. + """ return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode() @@ -91,17 +96,33 @@ class SearchHit(TypedDict): @dataclass(frozen=True, slots=True) class SearchResults: + """ + Container for search results with pagination metadata. + + Attributes: + hits: List of search results with scores and highlights + total: Total matching documents across all pages (for pagination) + query: Preprocessed query string after date/syntax rewriting + """ + hits: list[SearchHit] total: int # total matching documents (for pagination) query: str # preprocessed query string class TantivyRelevanceList: - """DRF-compatible list wrapper for Tantivy search hits. + """ + DRF-compatible list wrapper for Tantivy search hits. - __len__ returns the total hit count (for pagination); __getitem__ slices - the hit list. Stores ALL post-filter hits so that get_all_result_ids() - can return every matching doc ID without a second query. 
+ Provides paginated access to search results while storing all hits in memory + for efficient ID retrieval. Used by Django REST framework for pagination. + + Methods: + __len__: Returns total hit count for pagination calculations + __getitem__: Slices the hit list for page-specific results + + Note: Stores ALL post-filter hits so get_all_result_ids() can return + every matching document ID without requiring a second search query. """ def __init__(self, hits: list[SearchHit]) -> None: @@ -115,11 +136,22 @@ class TantivyRelevanceList: class SearchIndexLockError(Exception): - pass + """Raised when the search index file lock cannot be acquired within the timeout.""" class WriteBatch: - """Context manager for bulk index operations with file locking.""" + """ + Context manager for bulk index operations with file locking. + + Provides transactional batch updates to the search index with proper + concurrency control via file locking. All operations within the batch + are committed atomically or rolled back on exception. + + Usage: + with backend.batch_update() as batch: + batch.add_or_update(document) + batch.remove(doc_id) + """ def __init__(self, backend: TantivyBackend, lock_timeout: float): self._backend = backend @@ -160,18 +192,29 @@ class WriteBatch: document: Document, effective_content: str | None = None, ) -> None: - """Add or update a document in the batch. + """ + Add or update a document in the batch. - Tantivy has no native upsert — we delete by id then re-add so - stale copies (e.g. after a permission change) don't linger. - ``effective_content`` overrides ``document.content`` for indexing. + Implements upsert behavior by deleting any existing document with the same ID + and adding the new version. This ensures stale document data (e.g., after + permission changes) doesn't persist in the index. 
+ + Args: + document: Django Document instance to index + effective_content: Override document.content for indexing (used when + re-indexing with newer OCR text from document versions) """ self.remove(document.pk) doc = self._backend._build_tantivy_doc(document, effective_content) self._writer.add_document(doc) def remove(self, doc_id: int) -> None: - """Remove a document from the batch.""" + """ + Remove a document from the batch by its primary key. + + Uses range query instead of term query to work around unsigned integer + type detection bug in tantivy-py 0.25. + """ # Use range query to work around u64 deletion bug self._writer.delete_documents_by_query( tantivy.Query.range_query( @@ -185,7 +228,17 @@ class WriteBatch: class TantivyBackend: - """Tantivy search backend with explicit lifecycle management.""" + """ + Tantivy search backend with explicit lifecycle management. + + Provides full-text search capabilities using the Tantivy search engine. + Supports in-memory indexes (for testing) and persistent on-disk indexes + (for production use). Handles document indexing, search queries, autocompletion, + and "more like this" functionality. + + The backend manages its own connection lifecycle and can be reset when + the underlying index directory changes (e.g., during test isolation). + """ def __init__(self, path: Path | None = None): # path=None → in-memory index (for tests) @@ -195,7 +248,13 @@ class TantivyBackend: self._schema = None def open(self) -> None: - """Open or rebuild the index. Idempotent.""" + """ + Open or rebuild the index as needed. + + For disk-based indexes, checks if rebuilding is needed due to schema + version or language changes. Registers custom tokenizers after opening. + Safe to call multiple times - subsequent calls are no-ops. + """ if self._index is not None: return if self._path is not None: @@ -206,7 +265,11 @@ class TantivyBackend: self._schema = self._index.schema def close(self) -> None: - """Close the index. 
Idempotent.""" + """ + Close the index and release resources. + + Safe to call multiple times - subsequent calls are no-ops. + """ self._index = None self._schema = None @@ -339,13 +402,30 @@ class TantivyBackend: document: Document, effective_content: str | None = None, ) -> None: - """Add or update a single document with file locking.""" + """ + Add or update a single document with file locking. + + Convenience method for single-document updates. For bulk operations, + use batch_update() context manager for better performance. + + Args: + document: Django Document instance to index + effective_content: Override document.content for indexing + """ self._ensure_open() with self.batch_update(lock_timeout=5.0) as batch: batch.add_or_update(document, effective_content) def remove(self, doc_id: int) -> None: - """Remove a single document with file locking.""" + """ + Remove a single document from the index with file locking. + + Convenience method for single-document removal. For bulk operations, + use batch_update() context manager for better performance. + + Args: + doc_id: Primary key of the document to remove + """ self._ensure_open() with self.batch_update(lock_timeout=5.0) as batch: batch.remove(doc_id) @@ -360,7 +440,24 @@ class TantivyBackend: *, sort_reverse: bool, ) -> SearchResults: - """Search the index.""" + """ + Execute a search query against the document index. + + Processes the user query through date rewriting, normalization, and + permission filtering before executing against Tantivy. Supports both + relevance-based and field-based sorting. 
+ + Args: + query: User's search query (supports natural date keywords, field filters) + user: User for permission filtering (None for superuser/no filtering) + page: Page number (1-indexed) for pagination + page_size: Number of results per page + sort_field: Field to sort by (None for relevance ranking) + sort_reverse: Whether to reverse the sort order + + Returns: + SearchResults with hits, total count, and processed query + """ self._ensure_open() tz = get_current_timezone() user_query = parse_user_query(self._index, query, tz) @@ -491,7 +588,21 @@ class TantivyBackend: limit: int, user: AbstractBaseUser | None = None, ) -> list[str]: - """Get autocomplete suggestions, optionally filtered by user visibility.""" + """ + Get autocomplete suggestions for search queries. + + Returns words that start with the given term prefix, ranked by document + frequency (how many documents contain each word). Optionally filters + results to only words from documents visible to the specified user. + + Args: + term: Prefix to match against autocomplete words + limit: Maximum number of suggestions to return + user: User for permission filtering (None for no filtering) + + Returns: + List of word suggestions ordered by frequency, then alphabetically + """ self._ensure_open() normalized_term = _ascii_fold(term.lower()) @@ -533,7 +644,21 @@ class TantivyBackend: page: int, page_size: int, ) -> SearchResults: - """Find documents similar to the given document.""" + """ + Find documents similar to the given document using content analysis. + + Uses Tantivy's "more like this" query to find documents with similar + content patterns. The original document is excluded from results. 
+ + Args: + doc_id: Primary key of the reference document + user: User for permission filtering (None for no filtering) + page: Page number (1-indexed) for pagination + page_size: Number of results per page + + Returns: + SearchResults with similar documents (excluding the original) + """ self._ensure_open() searcher = self._index.searcher() @@ -621,12 +746,36 @@ class TantivyBackend: ) def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch: - """Get a batch context manager for bulk operations.""" + """ + Get a batch context manager for bulk index operations. + + Use this for efficient bulk document updates/deletions. All operations + within the batch are committed atomically at the end of the context. + + Args: + lock_timeout: Seconds to wait for file lock acquisition + + Returns: + WriteBatch context manager + + Raises: + SearchIndexLockError: If lock cannot be acquired within timeout + """ self._ensure_open() return WriteBatch(self, lock_timeout) def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None: - """Rebuild the entire search index.""" + """ + Rebuild the entire search index from scratch. + + Wipes the existing index and re-indexes all provided documents. + On failure, restores the previous index state to keep the backend usable. + + Args: + documents: QuerySet of Document instances to index + iter_wrapper: Optional wrapper function for progress tracking + (e.g., progress bar). Should yield each document unchanged. + """ # Create new index (on-disk or in-memory) if self._path is not None: wipe_index(self._path) @@ -662,11 +811,15 @@ _backend_lock = threading.RLock() def get_backend() -> TantivyBackend: - """Get the global backend instance with thread safety. + """ + Get the global backend instance with thread safety. 
- Automatically reinitializes when settings.INDEX_DIR changes — this fixes - the xdist/override_settings isolation issue where each test may set a - different INDEX_DIR but would otherwise share a stale singleton. + Returns a singleton TantivyBackend instance, automatically reinitializing + when settings.INDEX_DIR changes. This ensures proper test isolation when + using pytest-xdist or @override_settings that change the index directory. + + Returns: + Thread-safe singleton TantivyBackend instance """ global _backend, _backend_path @@ -693,7 +846,12 @@ def get_backend() -> TantivyBackend: def reset_backend() -> None: - """Reset the global backend instance with thread safety.""" + """ + Reset the global backend instance with thread safety. + + Forces creation of a new backend instance on the next get_backend() call. + Used for test isolation and when switching between different index directories. + """ global _backend, _backend_path with _backend_lock: diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index b58ebe430..4dfd025a4 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -273,9 +273,24 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str: def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str: """ - Preprocessing stage 1: rewrite Whoosh compact dates, relative ranges, - and natural date keywords (field:today etc.) to ISO 8601. - Bare keywords without a field: prefix pass through unchanged. + Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility. + + Performs the first stage of query preprocessing, converting various date + formats and keywords to ISO 8601 datetime ranges that Tantivy can parse: + - Compact 14-digit dates (YYYYMMDDHHmmss) + - Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h]) + - 8-digit dates with field awareness (created:20240115) + - Natural keywords (field:today, field:last_week, etc.) 
+ + Args: + query: Raw user query string + tz: Timezone for converting local date boundaries to UTC + + Returns: + Query with date syntax rewritten to ISO 8601 ranges + + Note: + Bare keywords without field prefixes pass through unchanged. """ query = _rewrite_compact_date(query) query = _rewrite_whoosh_relative_range(query) @@ -293,8 +308,18 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str: def normalize_query(query: str) -> str: """ - Join comma-separated field values with AND, collapse whitespace. - tag:foo,bar → tag:foo AND tag:bar + Normalize query syntax for better search behavior. + + Expands comma-separated field values to explicit AND clauses and + collapses excessive whitespace for cleaner parsing: + - tag:foo,bar → tag:foo AND tag:bar + - multiple spaces → single spaces + + Args: + query: Query string after date rewriting + + Returns: + Normalized query string ready for Tantivy parsing """ def _expand(m: re.Match[str]) -> str: @@ -314,24 +339,27 @@ def build_permission_filter( user: AbstractBaseUser, ) -> tantivy.Query: """ - Returns a Query matching documents visible to user: - - no owner (public) → owner_id field absent (NULL in Django) - - owned by user → owner_id = user.pk - - shared with user → viewer_id = user.pk + Build a query filter for user document permissions. - Uses disjunction_max_query — boolean Should-only would match all docs. + Creates a query that matches only documents visible to the specified user + according to paperless-ngx permission rules: + - Public documents (no owner) are visible to all users + - Private documents are visible to their owner + - Documents explicitly shared with the user are visible - NOTE: all integer queries use range_query, not term_query, to avoid the - unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64 - before u64; confirmed empirically — term_query returns 0 for u64 fields). - Same root cause as issue #47 (from_dict) but the term_query path unfixed. 
- See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190 - https://github.com/quickwit-oss/tantivy-py/issues/47 + Args: + schema: Tantivy schema used to construct the range queries + user: User to check permissions for - NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because - exists_query is not available in 0.25.1. It is present in master and can - simplify this to MustNot(exists_query("owner_id")) once released. - See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi + Returns: + Tantivy query that filters results to visible documents + + Implementation Notes: + - Uses range_query instead of term_query to work around unsigned integer + type detection bug in tantivy-py 0.25 + - Uses boolean_query for "no owner" check since exists_query is not + available in tantivy-py 0.25.1 (available in master) + - Uses disjunction_max_query for the OR logic; a Should-only boolean query would match all docs """ owner_any = tantivy.Query.range_query( schema, @@ -380,12 +408,28 @@ def parse_user_query( raw_query: str, tz: tzinfo, ) -> tantivy.Query: - """Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse. + """ + Parse user query through the complete preprocessing pipeline. - When ADVANCED_FUZZY_SEARCH_THRESHOLD is set (any float), a fuzzy query is blended in as a - Should clause boosted at 0.1 — keeping fuzzy hits ranked below exact matches. The fuzzy - query uses edit-distance=1, prefix=True, transposition_cost_one=True on all search fields. - The threshold float is a post-search minimum-score filter applied in the backend layer, not here. + Transforms the raw user query through multiple stages: + 1. Date keyword rewriting (today → ISO 8601 ranges) + 2. Query normalization (comma expansion, whitespace cleanup) + 3. Tantivy parsing with field boosts + 4. 
Optional fuzzy query blending (if ADVANCED_FUZZY_SEARCH_THRESHOLD set) + + Args: + index: Tantivy index with registered tokenizers + raw_query: Original user query string + tz: Timezone for date boundary calculations + + Returns: + Parsed Tantivy query ready for execution + + Note: + When ADVANCED_FUZZY_SEARCH_THRESHOLD is configured, adds a low-priority + fuzzy query as a Should clause (0.1 boost) to catch approximate matches + while keeping exact matches ranked higher. The threshold value is applied + as a post-search score filter, not during query construction. """ query_str = rewrite_natural_date_keywords(raw_query, tz) diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index ef7e4a921..18575cb4c 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -16,7 +16,16 @@ SCHEMA_VERSION = 1 def build_schema() -> tantivy.Schema: - """Build the Tantivy schema for the paperless document index.""" + """ + Build the Tantivy schema for the paperless document index. + + Creates a comprehensive schema supporting full-text search, filtering, + sorting, and autocomplete functionality. Includes fields for document + content, metadata, permissions, custom fields, and notes. + + Returns: + Configured Tantivy schema ready for index creation + """ sb = tantivy.SchemaBuilder() sb.add_unsigned_field("id", stored=True, indexed=True, fast=True) @@ -79,7 +88,19 @@ def build_schema() -> tantivy.Schema: def needs_rebuild(index_dir: Path) -> bool: - """Check if the search index needs rebuilding by comparing schema version and language sentinel files.""" + """ + Check if the search index needs rebuilding. + + Compares the current schema version and search language configuration + against sentinel files to determine if the index is compatible with + the current paperless-ngx version and settings. 
+ + Args: + index_dir: Path to the search index directory + + Returns: + True if the index needs rebuilding, False if it's up to date + """ version_file = index_dir / ".schema_version" if not version_file.exists(): return True @@ -102,7 +123,15 @@ def needs_rebuild(index_dir: Path) -> bool: def wipe_index(index_dir: Path) -> None: - """Delete all children in the index directory to prepare for rebuild.""" + """ + Delete all contents of the index directory to prepare for rebuild. + + Recursively removes all files and subdirectories within the index + directory while preserving the directory itself. + + Args: + index_dir: Path to the search index directory to clear + """ for child in list(index_dir.iterdir()): if child.is_dir(): shutil.rmtree(child) @@ -118,9 +147,17 @@ def _write_sentinels(index_dir: Path) -> None: def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index: """ - Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR), - creating or rebuilding as needed. - Caller must register custom tokenizers after receiving the Index. + Open the Tantivy index, creating or rebuilding as needed. + + Checks if the index needs rebuilding due to schema version or language + changes. If rebuilding is needed, wipes the directory and creates a fresh + index with the current schema and configuration. + + Args: + index_dir: Path to index directory (defaults to settings.INDEX_DIR) + + Returns: + Opened Tantivy index (caller must register custom tokenizers) """ if index_dir is None: index_dir = settings.INDEX_DIR diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index 628b9dcdb..e597a879e 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -51,13 +51,21 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP) def register_tokenizers(index: tantivy.Index, language: str | None) -> None: """ - Register all custom tokenizers on *index*. 
Must be called on every Index - instance — tantivy requires re-registration at each open. + Register all custom tokenizers required by the paperless schema. - simple_analyzer is also registered as a fast-field tokenizer because the - sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True. - Tantivy writes default values for fast columns on every commit, even for - documents that omit those fields, so the fast-field tokenizer must exist. + Must be called on every Index instance since Tantivy requires tokenizer + re-registration after each index open/creation. Registers tokenizers for + full-text search, sorting, CJK language support, and fast-field indexing. + + Args: + index: Tantivy index instance to register tokenizers on + language: ISO 639-1 language code for stemming (None to disable) + + Note: + simple_analyzer is registered as both a text and fast-field tokenizer + because the sort shadow fields (title_sort, correspondent_sort, type_sort) + use fast=True: Tantivy writes default values for fast columns on every + commit, even for documents that omit those fields, so it must exist. 
""" index.register_tokenizer("paperless_text", _paperless_text(language)) index.register_tokenizer("simple_analyzer", _simple_analyzer()) diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index 23adfda85..f540d6d51 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -16,7 +16,7 @@ class TestWriteBatch: """Test WriteBatch context manager functionality.""" def test_rolls_back_on_exception(self, backend: TantivyBackend): - """Data integrity: a mid-batch exception must not corrupt the index.""" + """Batch operations must rollback on exception to preserve index integrity.""" doc = Document.objects.create( title="Rollback Target", content="should survive", @@ -47,7 +47,7 @@ class TestSearch: """Test search functionality.""" def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend): - """UI score bar depends on the top hit being 1.0.""" + """Search scores must be normalized so top hit has score 1.0 for UI consistency.""" for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]): doc = Document.objects.create( title=title, @@ -68,7 +68,7 @@ class TestSearch: assert all(0.0 <= h["score"] <= 1.0 for h in r.hits) def test_owner_filter(self, backend: TantivyBackend): - """Owner can find their document; other user cannot.""" + """Document owners can search their private documents; other users cannot access them.""" owner = User.objects.create_user("owner") other = User.objects.create_user("other") doc = Document.objects.create( @@ -108,7 +108,7 @@ class TestRebuild: """Test index rebuilding functionality.""" def test_with_iter_wrapper_called(self, backend: TantivyBackend): - """rebuild() must pass documents through iter_wrapper.""" + """Index rebuild must pass documents through iter_wrapper for progress tracking.""" seen = [] def wrapper(docs): @@ -125,7 +125,7 @@ class TestAutocomplete: """Test autocomplete functionality.""" def 
test_basic_functionality(self, backend: TantivyBackend): - """Autocomplete should find word prefixes.""" + """Autocomplete must return words matching the given prefix.""" doc = Document.objects.create( title="Invoice from Microsoft Corporation", content="payment details", @@ -138,7 +138,7 @@ class TestAutocomplete: assert "microsoft" in results def test_results_ordered_by_document_frequency(self, backend: TantivyBackend): - """Most-used prefix match should rank first.""" + """Autocomplete results must be ordered by document frequency to prioritize common terms.""" # "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should # return "payment" before "payslip". for i, (title, checksum) in enumerate( @@ -166,7 +166,7 @@ class TestMoreLikeThis: """Test more like this functionality.""" def test_excludes_original(self, backend: TantivyBackend): - """More like this should not return the original document.""" + """More like this queries must exclude the reference document from results.""" doc1 = Document.objects.create( title="Important document", content="financial information", @@ -197,9 +197,11 @@ class TestSingleton: reset_backend() def test_returns_same_instance_on_repeated_calls(self, index_dir): + """Singleton pattern: repeated calls to get_backend() must return the same instance.""" assert get_backend() is get_backend() def test_reinitializes_when_index_dir_changes(self, tmp_path, settings): + """Backend singleton must reinitialize when INDEX_DIR setting changes for test isolation.""" settings.INDEX_DIR = tmp_path / "a" (tmp_path / "a").mkdir() b1 = get_backend() @@ -212,6 +214,7 @@ class TestSingleton: assert b2._path == tmp_path / "b" def test_reset_forces_new_instance(self, index_dir): + """reset_backend() must force creation of a new backend instance on next get_backend() call.""" b1 = get_backend() reset_backend() b2 = get_backend() @@ -222,7 +225,7 @@ class TestFieldHandling: """Test handling of various document fields.""" def 
test_none_values_handled_correctly(self, backend: TantivyBackend): - """Test that None values for original_filename and page_count are handled properly.""" + """Document fields with None values must not cause indexing errors.""" doc = Document.objects.create( title="Test Doc", content="test content", @@ -245,7 +248,7 @@ class TestFieldHandling: assert results.total == 1 def test_custom_fields_include_name_and_value(self, backend: TantivyBackend): - """Custom field indexing should include both name and value.""" + """Custom fields must be indexed with both field name and value for structured queries.""" # Create a custom field field = CustomField.objects.create( name="Invoice Number", @@ -277,7 +280,7 @@ class TestFieldHandling: assert results.total == 1 def test_notes_include_user_information(self, backend: TantivyBackend): - """Notes should include user information when available.""" + """Notes must be indexed with user information when available for structured queries.""" user = User.objects.create_user("notewriter") doc = Document.objects.create( title="Doc with notes", diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py index 6728784ae..aee52a567 100644 --- a/src/documents/tests/search/test_tokenizer.py +++ b/src/documents/tests/search/test_tokenizer.py @@ -45,7 +45,7 @@ class TestTokenizers: self, content_index: tantivy.Index, ) -> None: - """paperless_text normalises diacritics so café is findable as cafe.""" + """ASCII folding allows searching accented text with plain ASCII queries.""" writer = content_index.writer() doc = tantivy.Document() doc.add_text("content", "café résumé") @@ -56,7 +56,7 @@ class TestTokenizers: assert content_index.searcher().search(q, limit=5).count == 1 def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None: - """bigram_analyzer makes CJK substrings searchable without whitespace.""" + """Bigram tokenizer enables substring search in CJK languages without 
whitespace delimiters.""" writer = bigram_index.writer() doc = tantivy.Document() doc.add_text("bigram_content", "東京都") @@ -67,6 +67,7 @@ class TestTokenizers: assert bigram_index.searcher().search(q, limit=5).count == 1 def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None: + """Unsupported language codes should log a warning and disable stemming gracefully.""" sb = tantivy.SchemaBuilder() sb.add_text_field("content", stored=True, tokenizer_name="paperless_text") schema = sb.build() diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index f6c8ba904..3e61b74f8 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -106,6 +106,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): @pytest.mark.django_db class TestMakeIndex: def test_reindex(self, mocker: MockerFixture) -> None: + """Reindex command must call the backend rebuild method to recreate the index.""" mock_get_backend = mocker.patch( "documents.management.commands.document_index.get_backend", ) @@ -113,12 +114,14 @@ class TestMakeIndex: mock_get_backend.return_value.rebuild.assert_called_once() def test_optimize(self) -> None: + """Optimize command must execute without error (Tantivy handles optimization automatically).""" call_command("document_index", "optimize", skip_checks=True) def test_reindex_if_needed_skips_when_up_to_date( self, mocker: MockerFixture, ) -> None: + """Conditional reindex must skip rebuild when schema version and language match.""" mocker.patch( "documents.management.commands.document_index.needs_rebuild", return_value=False, @@ -133,6 +136,7 @@ class TestMakeIndex: self, mocker: MockerFixture, ) -> None: + """Conditional reindex must proceed with rebuild when schema version or language changed.""" mocker.patch( "documents.management.commands.document_index.needs_rebuild", return_value=True, diff --git a/src/documents/tests/test_tasks.py 
b/src/documents/tests/test_tasks.py index d73481609..9fb9ddbc6 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -23,8 +23,10 @@ from documents.tests.utils import DirectoriesMixin from documents.tests.utils import FileSystemAssertsMixin -class TestIndexOptimize(TestCase): +@pytest.mark.django_db +class TestIndexOptimize: def test_index_optimize(self) -> None: + """Index optimization task must execute without error (Tantivy handles optimization automatically).""" tasks.index_optimize()