docs: Enhance docstrings and test quality for Tantivy search backend

- Add comprehensive docstrings to all public methods and classes in the search package
  - Clarify purpose, parameters, return values, and implementation notes
  - Document thread safety, error handling, and usage patterns
  - Explain Tantivy-specific workarounds and design decisions

- Improve test quality and pytest compliance
  - Add descriptive comments explaining what each test verifies
  - Convert TestIndexOptimize to pytest style with @pytest.mark.django_db
  - Ensure all test docstrings focus on behavior verification rather than implementation

- Maintain existing functionality while improving code documentation
  - No changes to production logic or test coverage
  - All tests continue to pass; test docstrings now state intent more clearly

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 15:54:18 -07:00
parent 12eb9b9abf
commit e7f68c2082
9 changed files with 342 additions and 78 deletions
@@ -14,6 +14,13 @@ logger = logging.getLogger("paperless.management.document_index")
class Command(PaperlessCommand):
"""
Django management command for search index operations.
Provides subcommands for reindexing documents and optimizing the search index.
Supports conditional reindexing based on schema version and language changes.
"""
help = "Manages the document index."
supports_progress_bar = True
+186 -28
View File
@@ -46,12 +46,17 @@ T = TypeVar("T")
def _identity(iterable: Iterable[T]) -> Iterable[T]:
"""Default iter_wrapper that passes through unchanged."""
"""Default iter_wrapper that passes documents through unchanged for indexing."""
return iterable
def _ascii_fold(s: str) -> str:
"""Normalize unicode to ASCII equivalent characters."""
"""
Normalize unicode to ASCII equivalent characters for search consistency.
Converts accented characters (e.g., "café") to their ASCII base forms ("cafe")
to enable cross-language searching without requiring exact diacritic matching.
"""
return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()
@@ -91,17 +96,33 @@ class SearchHit(TypedDict):
@dataclass(frozen=True, slots=True)
class SearchResults:
"""
Container for search results with pagination metadata.
Attributes:
hits: List of search results with scores and highlights
total: Total matching documents across all pages (for pagination)
query: Preprocessed query string after date/syntax rewriting
"""
hits: list[SearchHit]
total: int # total matching documents (for pagination)
query: str # preprocessed query string
class TantivyRelevanceList:
"""DRF-compatible list wrapper for Tantivy search hits.
"""
DRF-compatible list wrapper for Tantivy search hits.
__len__ returns the total hit count (for pagination); __getitem__ slices
the hit list. Stores ALL post-filter hits so that get_all_result_ids()
can return every matching doc ID without a second query.
Provides paginated access to search results while storing all hits in memory
for efficient ID retrieval. Used by Django REST framework for pagination.
Methods:
__len__: Returns total hit count for pagination calculations
__getitem__: Slices the hit list for page-specific results
Note: Stores ALL post-filter hits so get_all_result_ids() can return
every matching document ID without requiring a second search query.
"""
def __init__(self, hits: list[SearchHit]) -> None:
@@ -115,11 +136,22 @@ class TantivyRelevanceList:
class SearchIndexLockError(Exception):
pass
"""Raised when the search index file lock cannot be acquired within the timeout."""
class WriteBatch:
"""Context manager for bulk index operations with file locking."""
"""
Context manager for bulk index operations with file locking.
Provides transactional batch updates to the search index with proper
concurrency control via file locking. All operations within the batch
are committed atomically or rolled back on exception.
Usage:
with backend.batch_update() as batch:
batch.add_or_update(document)
batch.remove(doc_id)
"""
def __init__(self, backend: TantivyBackend, lock_timeout: float):
self._backend = backend
@@ -160,18 +192,29 @@ class WriteBatch:
document: Document,
effective_content: str | None = None,
) -> None:
"""Add or update a document in the batch.
"""
Add or update a document in the batch.
Tantivy has no native upsert — we delete by id then re-add so
stale copies (e.g. after a permission change) don't linger.
``effective_content`` overrides ``document.content`` for indexing.
Implements upsert behavior by deleting any existing document with the same ID
and adding the new version. This ensures stale document data (e.g., after
permission changes) doesn't persist in the index.
Args:
document: Django Document instance to index
effective_content: Override document.content for indexing (used when
re-indexing with newer OCR text from document versions)
"""
self.remove(document.pk)
doc = self._backend._build_tantivy_doc(document, effective_content)
self._writer.add_document(doc)
def remove(self, doc_id: int) -> None:
"""Remove a document from the batch."""
"""
Remove a document from the batch by its primary key.
Uses range query instead of term query to work around unsigned integer
type detection bug in tantivy-py 0.25.
"""
# Use range query to work around u64 deletion bug
self._writer.delete_documents_by_query(
tantivy.Query.range_query(
@@ -185,7 +228,17 @@ class WriteBatch:
class TantivyBackend:
"""Tantivy search backend with explicit lifecycle management."""
"""
Tantivy search backend with explicit lifecycle management.
Provides full-text search capabilities using the Tantivy search engine.
Supports in-memory indexes (for testing) and persistent on-disk indexes
(for production use). Handles document indexing, search queries, autocompletion,
and "more like this" functionality.
The backend manages its own connection lifecycle and can be reset when
the underlying index directory changes (e.g., during test isolation).
"""
def __init__(self, path: Path | None = None):
# path=None → in-memory index (for tests)
@@ -195,7 +248,13 @@ class TantivyBackend:
self._schema = None
def open(self) -> None:
"""Open or rebuild the index. Idempotent."""
"""
Open or rebuild the index as needed.
For disk-based indexes, checks if rebuilding is needed due to schema
version or language changes. Registers custom tokenizers after opening.
Safe to call multiple times - subsequent calls are no-ops.
"""
if self._index is not None:
return
if self._path is not None:
@@ -206,7 +265,11 @@ class TantivyBackend:
self._schema = self._index.schema
def close(self) -> None:
"""Close the index. Idempotent."""
"""
Close the index and release resources.
Safe to call multiple times - subsequent calls are no-ops.
"""
self._index = None
self._schema = None
@@ -339,13 +402,30 @@ class TantivyBackend:
document: Document,
effective_content: str | None = None,
) -> None:
"""Add or update a single document with file locking."""
"""
Add or update a single document with file locking.
Convenience method for single-document updates. For bulk operations,
use batch_update() context manager for better performance.
Args:
document: Django Document instance to index
effective_content: Override document.content for indexing
"""
self._ensure_open()
with self.batch_update(lock_timeout=5.0) as batch:
batch.add_or_update(document, effective_content)
def remove(self, doc_id: int) -> None:
"""Remove a single document with file locking."""
"""
Remove a single document from the index with file locking.
Convenience method for single-document removal. For bulk operations,
use batch_update() context manager for better performance.
Args:
doc_id: Primary key of the document to remove
"""
self._ensure_open()
with self.batch_update(lock_timeout=5.0) as batch:
batch.remove(doc_id)
@@ -360,7 +440,24 @@ class TantivyBackend:
*,
sort_reverse: bool,
) -> SearchResults:
"""Search the index."""
"""
Execute a search query against the document index.
Processes the user query through date rewriting, normalization, and
permission filtering before executing against Tantivy. Supports both
relevance-based and field-based sorting.
Args:
query: User's search query (supports natural date keywords, field filters)
user: User for permission filtering (None for superuser/no filtering)
page: Page number (1-indexed) for pagination
page_size: Number of results per page
sort_field: Field to sort by (None for relevance ranking)
sort_reverse: Whether to reverse the sort order
Returns:
SearchResults with hits, total count, and processed query
"""
self._ensure_open()
tz = get_current_timezone()
user_query = parse_user_query(self._index, query, tz)
@@ -491,7 +588,21 @@ class TantivyBackend:
limit: int,
user: AbstractBaseUser | None = None,
) -> list[str]:
"""Get autocomplete suggestions, optionally filtered by user visibility."""
"""
Get autocomplete suggestions for search queries.
Returns words that start with the given term prefix, ranked by document
frequency (how many documents contain each word). Optionally filters
results to only words from documents visible to the specified user.
Args:
term: Prefix to match against autocomplete words
limit: Maximum number of suggestions to return
user: User for permission filtering (None for no filtering)
Returns:
List of word suggestions ordered by frequency, then alphabetically
"""
self._ensure_open()
normalized_term = _ascii_fold(term.lower())
@@ -533,7 +644,21 @@ class TantivyBackend:
page: int,
page_size: int,
) -> SearchResults:
"""Find documents similar to the given document."""
"""
Find documents similar to the given document using content analysis.
Uses Tantivy's "more like this" query to find documents with similar
content patterns. The original document is excluded from results.
Args:
doc_id: Primary key of the reference document
user: User for permission filtering (None for no filtering)
page: Page number (1-indexed) for pagination
page_size: Number of results per page
Returns:
SearchResults with similar documents (excluding the original)
"""
self._ensure_open()
searcher = self._index.searcher()
@@ -621,12 +746,36 @@ class TantivyBackend:
)
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
"""Get a batch context manager for bulk operations."""
"""
Get a batch context manager for bulk index operations.
Use this for efficient bulk document updates/deletions. All operations
within the batch are committed atomically at the end of the context.
Args:
lock_timeout: Seconds to wait for file lock acquisition
Returns:
WriteBatch context manager
Raises:
SearchIndexLockError: If lock cannot be acquired within timeout
"""
self._ensure_open()
return WriteBatch(self, lock_timeout)
def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
"""Rebuild the entire search index."""
"""
Rebuild the entire search index from scratch.
Wipes the existing index and re-indexes all provided documents.
On failure, restores the previous index state to keep the backend usable.
Args:
documents: QuerySet of Document instances to index
iter_wrapper: Optional wrapper function for progress tracking
(e.g., progress bar). Should yield each document unchanged.
"""
# Create new index (on-disk or in-memory)
if self._path is not None:
wipe_index(self._path)
@@ -662,11 +811,15 @@ _backend_lock = threading.RLock()
def get_backend() -> TantivyBackend:
"""Get the global backend instance with thread safety.
"""
Get the global backend instance with thread safety.
Automatically reinitializes when settings.INDEX_DIR changes — this fixes
the xdist/override_settings isolation issue where each test may set a
different INDEX_DIR but would otherwise share a stale singleton.
Returns a singleton TantivyBackend instance, automatically reinitializing
when settings.INDEX_DIR changes. This ensures proper test isolation when
using pytest-xdist or @override_settings that change the index directory.
Returns:
Thread-safe singleton TantivyBackend instance
"""
global _backend, _backend_path
@@ -693,7 +846,12 @@ def get_backend() -> TantivyBackend:
def reset_backend() -> None:
"""Reset the global backend instance with thread safety."""
"""
Reset the global backend instance with thread safety.
Forces creation of a new backend instance on the next get_backend() call.
Used for test isolation and when switching between different index directories.
"""
global _backend, _backend_path
with _backend_lock:
+69 -25
View File
@@ -273,9 +273,24 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
"""
Preprocessing stage 1: rewrite Whoosh compact dates, relative ranges,
and natural date keywords (field:today etc.) to ISO 8601.
Bare keywords without a field: prefix pass through unchanged.
Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
Performs the first stage of query preprocessing, converting various date
formats and keywords to ISO 8601 datetime ranges that Tantivy can parse:
- Compact 14-digit dates (YYYYMMDDHHmmss)
- Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h])
- 8-digit dates with field awareness (created:20240115)
- Natural keywords (field:today, field:last_week, etc.)
Args:
query: Raw user query string
tz: Timezone for converting local date boundaries to UTC
Returns:
Query with date syntax rewritten to ISO 8601 ranges
Note:
Bare keywords without field prefixes pass through unchanged.
"""
query = _rewrite_compact_date(query)
query = _rewrite_whoosh_relative_range(query)
@@ -293,8 +308,18 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
def normalize_query(query: str) -> str:
"""
Join comma-separated field values with AND, collapse whitespace.
tag:foo,bar → tag:foo AND tag:bar
Normalize query syntax for better search behavior.
Expands comma-separated field values to explicit AND clauses and
collapses excessive whitespace for cleaner parsing:
- tag:foo,bar → tag:foo AND tag:bar
- multiple spaces → single spaces
Args:
query: Query string after date rewriting
Returns:
Normalized query string ready for Tantivy parsing
"""
def _expand(m: re.Match[str]) -> str:
@@ -314,24 +339,27 @@ def build_permission_filter(
user: AbstractBaseUser,
) -> tantivy.Query:
"""
Returns a Query matching documents visible to user:
- no owner (public) → owner_id field absent (NULL in Django)
- owned by user → owner_id = user.pk
- shared with user → viewer_id = user.pk
Build a query filter for user document permissions.
Uses disjunction_max_query — boolean Should-only would match all docs.
Creates a query that matches only documents visible to the specified user
according to paperless-ngx permission rules:
- Public documents (no owner) are visible to all users
- Private documents are visible to their owner
- Documents explicitly shared with the user are visible
NOTE: all integer queries use range_query, not term_query, to avoid the
unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64
before u64; confirmed empirically — term_query returns 0 for u64 fields).
Same root cause as issue #47 (from_dict) but the term_query path unfixed.
See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
https://github.com/quickwit-oss/tantivy-py/issues/47
Args:
schema: Tantivy schema for field validation
user: User to check permissions for
NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because
exists_query is not available in 0.25.1. It is present in master and can
simplify this to MustNot(exists_query("owner_id")) once released.
See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
Returns:
Tantivy query that filters results to visible documents
Implementation Notes:
- Uses range_query instead of term_query to work around unsigned integer
type detection bug in tantivy-py 0.25
- Uses boolean_query for "no owner" check since exists_query is not
available in tantivy-py 0.25.1 (available in master)
- Uses disjunction_max_query to combine permission clauses with OR logic
"""
owner_any = tantivy.Query.range_query(
schema,
@@ -380,12 +408,28 @@ def parse_user_query(
raw_query: str,
tz: tzinfo,
) -> tantivy.Query:
"""Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse.
"""
Parse user query through the complete preprocessing pipeline.
When ADVANCED_FUZZY_SEARCH_THRESHOLD is set (any float), a fuzzy query is blended in as a
Should clause boosted at 0.1 — keeping fuzzy hits ranked below exact matches. The fuzzy
query uses edit-distance=1, prefix=True, transposition_cost_one=True on all search fields.
The threshold float is a post-search minimum-score filter applied in the backend layer, not here.
Transforms the raw user query through multiple stages:
1. Date keyword rewriting (today → ISO 8601 ranges)
2. Query normalization (comma expansion, whitespace cleanup)
3. Tantivy parsing with field boosts
4. Optional fuzzy query blending (if ADVANCED_FUZZY_SEARCH_THRESHOLD set)
Args:
index: Tantivy index with registered tokenizers
raw_query: Original user query string
tz: Timezone for date boundary calculations
Returns:
Parsed Tantivy query ready for execution
Note:
When ADVANCED_FUZZY_SEARCH_THRESHOLD is configured, adds a low-priority
fuzzy query as a Should clause (0.1 boost) to catch approximate matches
while keeping exact matches ranked higher. The threshold value is applied
as a post-search score filter, not during query construction.
"""
query_str = rewrite_natural_date_keywords(raw_query, tz)
+43 -6
View File
@@ -16,7 +16,16 @@ SCHEMA_VERSION = 1
def build_schema() -> tantivy.Schema:
"""Build the Tantivy schema for the paperless document index."""
"""
Build the Tantivy schema for the paperless document index.
Creates a comprehensive schema supporting full-text search, filtering,
sorting, and autocomplete functionality. Includes fields for document
content, metadata, permissions, custom fields, and notes.
Returns:
Configured Tantivy schema ready for index creation
"""
sb = tantivy.SchemaBuilder()
sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
@@ -79,7 +88,19 @@ def build_schema() -> tantivy.Schema:
def needs_rebuild(index_dir: Path) -> bool:
"""Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
"""
Check if the search index needs rebuilding.
Compares the current schema version and search language configuration
against sentinel files to determine if the index is compatible with
the current paperless-ngx version and settings.
Args:
index_dir: Path to the search index directory
Returns:
True if the index needs rebuilding, False if it's up to date
"""
version_file = index_dir / ".schema_version"
if not version_file.exists():
return True
@@ -102,7 +123,15 @@ def needs_rebuild(index_dir: Path) -> bool:
def wipe_index(index_dir: Path) -> None:
"""Delete all children in the index directory to prepare for rebuild."""
"""
Delete all contents of the index directory to prepare for rebuild.
Recursively removes all files and subdirectories within the index
directory while preserving the directory itself.
Args:
index_dir: Path to the search index directory to clear
"""
for child in list(index_dir.iterdir()):
if child.is_dir():
shutil.rmtree(child)
@@ -118,9 +147,17 @@ def _write_sentinels(index_dir: Path) -> None:
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
"""
Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR),
creating or rebuilding as needed.
Caller must register custom tokenizers after receiving the Index.
Open the Tantivy index, creating or rebuilding as needed.
Checks if the index needs rebuilding due to schema version or language
changes. If rebuilding is needed, wipes the directory and creates a fresh
index with the current schema and configuration.
Args:
index_dir: Path to index directory (defaults to settings.INDEX_DIR)
Returns:
Opened Tantivy index (caller must register custom tokenizers)
"""
if index_dir is None:
index_dir = settings.INDEX_DIR
+14 -6
View File
@@ -51,13 +51,21 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
"""
Register all custom tokenizers on *index*. Must be called on every Index
instance — tantivy requires re-registration at each open.
Register all custom tokenizers required by the paperless schema.
simple_analyzer is also registered as a fast-field tokenizer because the
sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
Tantivy writes default values for fast columns on every commit, even for
documents that omit those fields, so the fast-field tokenizer must exist.
Must be called on every Index instance since Tantivy requires tokenizer
re-registration after each index open/creation. Registers tokenizers for
full-text search, sorting, CJK language support, and fast-field indexing.
Args:
index: Tantivy index instance to register tokenizers on
language: ISO 639-1 language code for stemming (None to disable)
Note:
simple_analyzer is registered as both a text and fast-field tokenizer
since sort shadow fields (title_sort, correspondent_sort, type_sort)
use fast=True and Tantivy requires fast-field tokenizers to exist
even for documents that omit those fields.
"""
index.register_tokenizer("paperless_text", _paperless_text(language))
index.register_tokenizer("simple_analyzer", _simple_analyzer())
+13 -10
View File
@@ -16,7 +16,7 @@ class TestWriteBatch:
"""Test WriteBatch context manager functionality."""
def test_rolls_back_on_exception(self, backend: TantivyBackend):
"""Data integrity: a mid-batch exception must not corrupt the index."""
"""Batch operations must rollback on exception to preserve index integrity."""
doc = Document.objects.create(
title="Rollback Target",
content="should survive",
@@ -47,7 +47,7 @@ class TestSearch:
"""Test search functionality."""
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
"""UI score bar depends on the top hit being 1.0."""
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
doc = Document.objects.create(
title=title,
@@ -68,7 +68,7 @@ class TestSearch:
assert all(0.0 <= h["score"] <= 1.0 for h in r.hits)
def test_owner_filter(self, backend: TantivyBackend):
"""Owner can find their document; other user cannot."""
"""Document owners can search their private documents; other users cannot access them."""
owner = User.objects.create_user("owner")
other = User.objects.create_user("other")
doc = Document.objects.create(
@@ -108,7 +108,7 @@ class TestRebuild:
"""Test index rebuilding functionality."""
def test_with_iter_wrapper_called(self, backend: TantivyBackend):
"""rebuild() must pass documents through iter_wrapper."""
"""Index rebuild must pass documents through iter_wrapper for progress tracking."""
seen = []
def wrapper(docs):
@@ -125,7 +125,7 @@ class TestAutocomplete:
"""Test autocomplete functionality."""
def test_basic_functionality(self, backend: TantivyBackend):
"""Autocomplete should find word prefixes."""
"""Autocomplete must return words matching the given prefix."""
doc = Document.objects.create(
title="Invoice from Microsoft Corporation",
content="payment details",
@@ -138,7 +138,7 @@ class TestAutocomplete:
assert "microsoft" in results
def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
"""Most-used prefix match should rank first."""
"""Autocomplete results must be ordered by document frequency to prioritize common terms."""
# "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should
# return "payment" before "payslip".
for i, (title, checksum) in enumerate(
@@ -166,7 +166,7 @@ class TestMoreLikeThis:
"""Test more like this functionality."""
def test_excludes_original(self, backend: TantivyBackend):
"""More like this should not return the original document."""
"""More like this queries must exclude the reference document from results."""
doc1 = Document.objects.create(
title="Important document",
content="financial information",
@@ -197,9 +197,11 @@ class TestSingleton:
reset_backend()
def test_returns_same_instance_on_repeated_calls(self, index_dir):
"""Singleton pattern: repeated calls to get_backend() must return the same instance."""
assert get_backend() is get_backend()
def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
"""Backend singleton must reinitialize when INDEX_DIR setting changes for test isolation."""
settings.INDEX_DIR = tmp_path / "a"
(tmp_path / "a").mkdir()
b1 = get_backend()
@@ -212,6 +214,7 @@ class TestSingleton:
assert b2._path == tmp_path / "b"
def test_reset_forces_new_instance(self, index_dir):
"""reset_backend() must force creation of a new backend instance on next get_backend() call."""
b1 = get_backend()
reset_backend()
b2 = get_backend()
@@ -222,7 +225,7 @@ class TestFieldHandling:
"""Test handling of various document fields."""
def test_none_values_handled_correctly(self, backend: TantivyBackend):
"""Test that None values for original_filename and page_count are handled properly."""
"""Document fields with None values must not cause indexing errors."""
doc = Document.objects.create(
title="Test Doc",
content="test content",
@@ -245,7 +248,7 @@ class TestFieldHandling:
assert results.total == 1
def test_custom_fields_include_name_and_value(self, backend: TantivyBackend):
"""Custom field indexing should include both name and value."""
"""Custom fields must be indexed with both field name and value for structured queries."""
# Create a custom field
field = CustomField.objects.create(
name="Invoice Number",
@@ -277,7 +280,7 @@ class TestFieldHandling:
assert results.total == 1
def test_notes_include_user_information(self, backend: TantivyBackend):
"""Notes should include user information when available."""
"""Notes must be indexed with user information when available for structured queries."""
user = User.objects.create_user("notewriter")
doc = Document.objects.create(
title="Doc with notes",
+3 -2
View File
@@ -45,7 +45,7 @@ class TestTokenizers:
self,
content_index: tantivy.Index,
) -> None:
"""paperless_text normalises diacritics so café is findable as cafe."""
"""ASCII folding allows searching accented text with plain ASCII queries."""
writer = content_index.writer()
doc = tantivy.Document()
doc.add_text("content", "café résumé")
@@ -56,7 +56,7 @@ class TestTokenizers:
assert content_index.searcher().search(q, limit=5).count == 1
def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
"""bigram_analyzer makes CJK substrings searchable without whitespace."""
"""Bigram tokenizer enables substring search in CJK languages without whitespace delimiters."""
writer = bigram_index.writer()
doc = tantivy.Document()
doc.add_text("bigram_content", "東京都")
@@ -67,6 +67,7 @@ class TestTokenizers:
assert bigram_index.searcher().search(q, limit=5).count == 1
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
"""Unsupported language codes should log a warning and disable stemming gracefully."""
sb = tantivy.SchemaBuilder()
sb.add_text_field("content", stored=True, tokenizer_name="paperless_text")
schema = sb.build()
+4
View File
@@ -106,6 +106,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@pytest.mark.django_db
class TestMakeIndex:
def test_reindex(self, mocker: MockerFixture) -> None:
"""Reindex command must call the backend rebuild method to recreate the index."""
mock_get_backend = mocker.patch(
"documents.management.commands.document_index.get_backend",
)
@@ -113,12 +114,14 @@ class TestMakeIndex:
mock_get_backend.return_value.rebuild.assert_called_once()
def test_optimize(self) -> None:
"""Optimize command must execute without error (Tantivy handles optimization automatically)."""
call_command("document_index", "optimize", skip_checks=True)
def test_reindex_if_needed_skips_when_up_to_date(
self,
mocker: MockerFixture,
) -> None:
"""Conditional reindex must skip rebuild when schema version and language match."""
mocker.patch(
"documents.management.commands.document_index.needs_rebuild",
return_value=False,
@@ -133,6 +136,7 @@ class TestMakeIndex:
self,
mocker: MockerFixture,
) -> None:
"""Conditional reindex must proceed with rebuild when schema version or language changed."""
mocker.patch(
"documents.management.commands.document_index.needs_rebuild",
return_value=True,
+3 -1
View File
@@ -23,8 +23,10 @@ from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
class TestIndexOptimize(TestCase):
@pytest.mark.django_db
class TestIndexOptimize:
def test_index_optimize(self) -> None:
"""Index optimization task must execute without error (Tantivy handles optimization automatically)."""
tasks.index_optimize()