mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-15 11:05:28 +00:00
docs: Enhance docstrings and test quality for Tantivy search backend
- Add comprehensive docstrings to all public methods and classes in the search package
- Clarify purpose, parameters, return values, and implementation notes
- Document thread safety, error handling, and usage patterns
- Explain Tantivy-specific workarounds and design decisions
- Improve test quality and pytest compliance
- Add descriptive comments explaining what each test verifies
- Convert TestIndexOptimize to pytest style with @pytest.mark.django_db
- Ensure all test docstrings focus on behavior verification rather than implementation
- Maintain existing functionality while improving code documentation
- No changes to production logic or test coverage
- All tests continue to pass with enhanced clarity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,13 @@ logger = logging.getLogger("paperless.management.document_index")
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
"""
|
||||
Django management command for search index operations.
|
||||
|
||||
Provides subcommands for reindexing documents and optimizing the search index.
|
||||
Supports conditional reindexing based on schema version and language changes.
|
||||
"""
|
||||
|
||||
help = "Manages the document index."
|
||||
|
||||
supports_progress_bar = True
|
||||
|
||||
@@ -46,12 +46,17 @@ T = TypeVar("T")
|
||||
|
||||
|
||||
def _identity(iterable: Iterable[T]) -> Iterable[T]:
|
||||
"""Default iter_wrapper that passes through unchanged."""
|
||||
"""Default iter_wrapper that passes documents through unchanged for indexing."""
|
||||
return iterable
|
||||
|
||||
|
||||
def _ascii_fold(s: str) -> str:
|
||||
"""Normalize unicode to ASCII equivalent characters."""
|
||||
"""
|
||||
Normalize unicode to ASCII equivalent characters for search consistency.
|
||||
|
||||
Converts accented characters (e.g., "café") to their ASCII base forms ("cafe")
|
||||
to enable cross-language searching without requiring exact diacritic matching.
|
||||
"""
|
||||
return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()
|
||||
|
||||
|
||||
@@ -91,17 +96,33 @@ class SearchHit(TypedDict):
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SearchResults:
|
||||
"""
|
||||
Container for search results with pagination metadata.
|
||||
|
||||
Attributes:
|
||||
hits: List of search results with scores and highlights
|
||||
total: Total matching documents across all pages (for pagination)
|
||||
query: Preprocessed query string after date/syntax rewriting
|
||||
"""
|
||||
|
||||
hits: list[SearchHit]
|
||||
total: int # total matching documents (for pagination)
|
||||
query: str # preprocessed query string
|
||||
|
||||
|
||||
class TantivyRelevanceList:
|
||||
"""DRF-compatible list wrapper for Tantivy search hits.
|
||||
"""
|
||||
DRF-compatible list wrapper for Tantivy search hits.
|
||||
|
||||
__len__ returns the total hit count (for pagination); __getitem__ slices
|
||||
the hit list. Stores ALL post-filter hits so that get_all_result_ids()
|
||||
can return every matching doc ID without a second query.
|
||||
Provides paginated access to search results while storing all hits in memory
|
||||
for efficient ID retrieval. Used by Django REST framework for pagination.
|
||||
|
||||
Methods:
|
||||
__len__: Returns total hit count for pagination calculations
|
||||
__getitem__: Slices the hit list for page-specific results
|
||||
|
||||
Note: Stores ALL post-filter hits so get_all_result_ids() can return
|
||||
every matching document ID without requiring a second search query.
|
||||
"""
|
||||
|
||||
def __init__(self, hits: list[SearchHit]) -> None:
|
||||
@@ -115,11 +136,22 @@ class TantivyRelevanceList:
|
||||
|
||||
|
||||
class SearchIndexLockError(Exception):
|
||||
pass
|
||||
"""Raised when the search index file lock cannot be acquired within the timeout."""
|
||||
|
||||
|
||||
class WriteBatch:
|
||||
"""Context manager for bulk index operations with file locking."""
|
||||
"""
|
||||
Context manager for bulk index operations with file locking.
|
||||
|
||||
Provides transactional batch updates to the search index with proper
|
||||
concurrency control via file locking. All operations within the batch
|
||||
are committed atomically or rolled back on exception.
|
||||
|
||||
Usage:
|
||||
with backend.batch_update() as batch:
|
||||
batch.add_or_update(document)
|
||||
batch.remove(doc_id)
|
||||
"""
|
||||
|
||||
def __init__(self, backend: TantivyBackend, lock_timeout: float):
|
||||
self._backend = backend
|
||||
@@ -160,18 +192,29 @@ class WriteBatch:
|
||||
document: Document,
|
||||
effective_content: str | None = None,
|
||||
) -> None:
|
||||
"""Add or update a document in the batch.
|
||||
"""
|
||||
Add or update a document in the batch.
|
||||
|
||||
Tantivy has no native upsert — we delete by id then re-add so
|
||||
stale copies (e.g. after a permission change) don't linger.
|
||||
``effective_content`` overrides ``document.content`` for indexing.
|
||||
Implements upsert behavior by deleting any existing document with the same ID
|
||||
and adding the new version. This ensures stale document data (e.g., after
|
||||
permission changes) doesn't persist in the index.
|
||||
|
||||
Args:
|
||||
document: Django Document instance to index
|
||||
effective_content: Override document.content for indexing (used when
|
||||
re-indexing with newer OCR text from document versions)
|
||||
"""
|
||||
self.remove(document.pk)
|
||||
doc = self._backend._build_tantivy_doc(document, effective_content)
|
||||
self._writer.add_document(doc)
|
||||
|
||||
def remove(self, doc_id: int) -> None:
|
||||
"""Remove a document from the batch."""
|
||||
"""
|
||||
Remove a document from the batch by its primary key.
|
||||
|
||||
Uses range query instead of term query to work around unsigned integer
|
||||
type detection bug in tantivy-py 0.25.
|
||||
"""
|
||||
# Use range query to work around u64 deletion bug
|
||||
self._writer.delete_documents_by_query(
|
||||
tantivy.Query.range_query(
|
||||
@@ -185,7 +228,17 @@ class WriteBatch:
|
||||
|
||||
|
||||
class TantivyBackend:
|
||||
"""Tantivy search backend with explicit lifecycle management."""
|
||||
"""
|
||||
Tantivy search backend with explicit lifecycle management.
|
||||
|
||||
Provides full-text search capabilities using the Tantivy search engine.
|
||||
Supports in-memory indexes (for testing) and persistent on-disk indexes
|
||||
(for production use). Handles document indexing, search queries, autocompletion,
|
||||
and "more like this" functionality.
|
||||
|
||||
The backend manages its own connection lifecycle and can be reset when
|
||||
the underlying index directory changes (e.g., during test isolation).
|
||||
"""
|
||||
|
||||
def __init__(self, path: Path | None = None):
|
||||
# path=None → in-memory index (for tests)
|
||||
@@ -195,7 +248,13 @@ class TantivyBackend:
|
||||
self._schema = None
|
||||
|
||||
def open(self) -> None:
|
||||
"""Open or rebuild the index. Idempotent."""
|
||||
"""
|
||||
Open or rebuild the index as needed.
|
||||
|
||||
For disk-based indexes, checks if rebuilding is needed due to schema
|
||||
version or language changes. Registers custom tokenizers after opening.
|
||||
Safe to call multiple times - subsequent calls are no-ops.
|
||||
"""
|
||||
if self._index is not None:
|
||||
return
|
||||
if self._path is not None:
|
||||
@@ -206,7 +265,11 @@ class TantivyBackend:
|
||||
self._schema = self._index.schema
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the index. Idempotent."""
|
||||
"""
|
||||
Close the index and release resources.
|
||||
|
||||
Safe to call multiple times - subsequent calls are no-ops.
|
||||
"""
|
||||
self._index = None
|
||||
self._schema = None
|
||||
|
||||
@@ -339,13 +402,30 @@ class TantivyBackend:
|
||||
document: Document,
|
||||
effective_content: str | None = None,
|
||||
) -> None:
|
||||
"""Add or update a single document with file locking."""
|
||||
"""
|
||||
Add or update a single document with file locking.
|
||||
|
||||
Convenience method for single-document updates. For bulk operations,
|
||||
use batch_update() context manager for better performance.
|
||||
|
||||
Args:
|
||||
document: Django Document instance to index
|
||||
effective_content: Override document.content for indexing
|
||||
"""
|
||||
self._ensure_open()
|
||||
with self.batch_update(lock_timeout=5.0) as batch:
|
||||
batch.add_or_update(document, effective_content)
|
||||
|
||||
def remove(self, doc_id: int) -> None:
|
||||
"""Remove a single document with file locking."""
|
||||
"""
|
||||
Remove a single document from the index with file locking.
|
||||
|
||||
Convenience method for single-document removal. For bulk operations,
|
||||
use batch_update() context manager for better performance.
|
||||
|
||||
Args:
|
||||
doc_id: Primary key of the document to remove
|
||||
"""
|
||||
self._ensure_open()
|
||||
with self.batch_update(lock_timeout=5.0) as batch:
|
||||
batch.remove(doc_id)
|
||||
@@ -360,7 +440,24 @@ class TantivyBackend:
|
||||
*,
|
||||
sort_reverse: bool,
|
||||
) -> SearchResults:
|
||||
"""Search the index."""
|
||||
"""
|
||||
Execute a search query against the document index.
|
||||
|
||||
Processes the user query through date rewriting, normalization, and
|
||||
permission filtering before executing against Tantivy. Supports both
|
||||
relevance-based and field-based sorting.
|
||||
|
||||
Args:
|
||||
query: User's search query (supports natural date keywords, field filters)
|
||||
user: User for permission filtering (None for superuser/no filtering)
|
||||
page: Page number (1-indexed) for pagination
|
||||
page_size: Number of results per page
|
||||
sort_field: Field to sort by (None for relevance ranking)
|
||||
sort_reverse: Whether to reverse the sort order
|
||||
|
||||
Returns:
|
||||
SearchResults with hits, total count, and processed query
|
||||
"""
|
||||
self._ensure_open()
|
||||
tz = get_current_timezone()
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
@@ -491,7 +588,21 @@ class TantivyBackend:
|
||||
limit: int,
|
||||
user: AbstractBaseUser | None = None,
|
||||
) -> list[str]:
|
||||
"""Get autocomplete suggestions, optionally filtered by user visibility."""
|
||||
"""
|
||||
Get autocomplete suggestions for search queries.
|
||||
|
||||
Returns words that start with the given term prefix, ranked by document
|
||||
frequency (how many documents contain each word). Optionally filters
|
||||
results to only words from documents visible to the specified user.
|
||||
|
||||
Args:
|
||||
term: Prefix to match against autocomplete words
|
||||
limit: Maximum number of suggestions to return
|
||||
user: User for permission filtering (None for no filtering)
|
||||
|
||||
Returns:
|
||||
List of word suggestions ordered by frequency, then alphabetically
|
||||
"""
|
||||
self._ensure_open()
|
||||
normalized_term = _ascii_fold(term.lower())
|
||||
|
||||
@@ -533,7 +644,21 @@ class TantivyBackend:
|
||||
page: int,
|
||||
page_size: int,
|
||||
) -> SearchResults:
|
||||
"""Find documents similar to the given document."""
|
||||
"""
|
||||
Find documents similar to the given document using content analysis.
|
||||
|
||||
Uses Tantivy's "more like this" query to find documents with similar
|
||||
content patterns. The original document is excluded from results.
|
||||
|
||||
Args:
|
||||
doc_id: Primary key of the reference document
|
||||
user: User for permission filtering (None for no filtering)
|
||||
page: Page number (1-indexed) for pagination
|
||||
page_size: Number of results per page
|
||||
|
||||
Returns:
|
||||
SearchResults with similar documents (excluding the original)
|
||||
"""
|
||||
self._ensure_open()
|
||||
searcher = self._index.searcher()
|
||||
|
||||
@@ -621,12 +746,36 @@ class TantivyBackend:
|
||||
)
|
||||
|
||||
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
|
||||
"""Get a batch context manager for bulk operations."""
|
||||
"""
|
||||
Get a batch context manager for bulk index operations.
|
||||
|
||||
Use this for efficient bulk document updates/deletions. All operations
|
||||
within the batch are committed atomically at the end of the context.
|
||||
|
||||
Args:
|
||||
lock_timeout: Seconds to wait for file lock acquisition
|
||||
|
||||
Returns:
|
||||
WriteBatch context manager
|
||||
|
||||
Raises:
|
||||
SearchIndexLockError: If lock cannot be acquired within timeout
|
||||
"""
|
||||
self._ensure_open()
|
||||
return WriteBatch(self, lock_timeout)
|
||||
|
||||
def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
|
||||
"""Rebuild the entire search index."""
|
||||
"""
|
||||
Rebuild the entire search index from scratch.
|
||||
|
||||
Wipes the existing index and re-indexes all provided documents.
|
||||
On failure, restores the previous index state to keep the backend usable.
|
||||
|
||||
Args:
|
||||
documents: QuerySet of Document instances to index
|
||||
iter_wrapper: Optional wrapper function for progress tracking
|
||||
(e.g., progress bar). Should yield each document unchanged.
|
||||
"""
|
||||
# Create new index (on-disk or in-memory)
|
||||
if self._path is not None:
|
||||
wipe_index(self._path)
|
||||
@@ -662,11 +811,15 @@ _backend_lock = threading.RLock()
|
||||
|
||||
|
||||
def get_backend() -> TantivyBackend:
|
||||
"""Get the global backend instance with thread safety.
|
||||
"""
|
||||
Get the global backend instance with thread safety.
|
||||
|
||||
Automatically reinitializes when settings.INDEX_DIR changes — this fixes
|
||||
the xdist/override_settings isolation issue where each test may set a
|
||||
different INDEX_DIR but would otherwise share a stale singleton.
|
||||
Returns a singleton TantivyBackend instance, automatically reinitializing
|
||||
when settings.INDEX_DIR changes. This ensures proper test isolation when
|
||||
using pytest-xdist or @override_settings that change the index directory.
|
||||
|
||||
Returns:
|
||||
Thread-safe singleton TantivyBackend instance
|
||||
"""
|
||||
global _backend, _backend_path
|
||||
|
||||
@@ -693,7 +846,12 @@ def get_backend() -> TantivyBackend:
|
||||
|
||||
|
||||
def reset_backend() -> None:
|
||||
"""Reset the global backend instance with thread safety."""
|
||||
"""
|
||||
Reset the global backend instance with thread safety.
|
||||
|
||||
Forces creation of a new backend instance on the next get_backend() call.
|
||||
Used for test isolation and when switching between different index directories.
|
||||
"""
|
||||
global _backend, _backend_path
|
||||
|
||||
with _backend_lock:
|
||||
|
||||
@@ -273,9 +273,24 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
|
||||
|
||||
def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
|
||||
"""
|
||||
Preprocessing stage 1: rewrite Whoosh compact dates, relative ranges,
|
||||
and natural date keywords (field:today etc.) to ISO 8601.
|
||||
Bare keywords without a field: prefix pass through unchanged.
|
||||
Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
|
||||
|
||||
Performs the first stage of query preprocessing, converting various date
|
||||
formats and keywords to ISO 8601 datetime ranges that Tantivy can parse:
|
||||
- Compact 14-digit dates (YYYYMMDDHHmmss)
|
||||
- Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h])
|
||||
- 8-digit dates with field awareness (created:20240115)
|
||||
- Natural keywords (field:today, field:last_week, etc.)
|
||||
|
||||
Args:
|
||||
query: Raw user query string
|
||||
tz: Timezone for converting local date boundaries to UTC
|
||||
|
||||
Returns:
|
||||
Query with date syntax rewritten to ISO 8601 ranges
|
||||
|
||||
Note:
|
||||
Bare keywords without field prefixes pass through unchanged.
|
||||
"""
|
||||
query = _rewrite_compact_date(query)
|
||||
query = _rewrite_whoosh_relative_range(query)
|
||||
@@ -293,8 +308,18 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
|
||||
|
||||
def normalize_query(query: str) -> str:
|
||||
"""
|
||||
Join comma-separated field values with AND, collapse whitespace.
|
||||
tag:foo,bar → tag:foo AND tag:bar
|
||||
Normalize query syntax for better search behavior.
|
||||
|
||||
Expands comma-separated field values to explicit AND clauses and
|
||||
collapses excessive whitespace for cleaner parsing:
|
||||
- tag:foo,bar → tag:foo AND tag:bar
|
||||
- multiple spaces → single spaces
|
||||
|
||||
Args:
|
||||
query: Query string after date rewriting
|
||||
|
||||
Returns:
|
||||
Normalized query string ready for Tantivy parsing
|
||||
"""
|
||||
|
||||
def _expand(m: re.Match[str]) -> str:
|
||||
@@ -314,24 +339,27 @@ def build_permission_filter(
|
||||
user: AbstractBaseUser,
|
||||
) -> tantivy.Query:
|
||||
"""
|
||||
Returns a Query matching documents visible to user:
|
||||
- no owner (public) → owner_id field absent (NULL in Django)
|
||||
- owned by user → owner_id = user.pk
|
||||
- shared with user → viewer_id = user.pk
|
||||
Build a query filter for user document permissions.
|
||||
|
||||
Uses disjunction_max_query — boolean Should-only would match all docs.
|
||||
Creates a query that matches only documents visible to the specified user
|
||||
according to paperless-ngx permission rules:
|
||||
- Public documents (no owner) are visible to all users
|
||||
- Private documents are visible to their owner
|
||||
- Documents explicitly shared with the user are visible
|
||||
|
||||
NOTE: all integer queries use range_query, not term_query, to avoid the
|
||||
unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64
|
||||
before u64; confirmed empirically — term_query returns 0 for u64 fields).
|
||||
Same root cause as issue #47 (from_dict) but the term_query path unfixed.
|
||||
See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
|
||||
https://github.com/quickwit-oss/tantivy-py/issues/47
|
||||
Args:
|
||||
schema: Tantivy schema for field validation
|
||||
user: User to check permissions for
|
||||
|
||||
NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because
|
||||
exists_query is not available in 0.25.1. It is present in master and can
|
||||
simplify this to MustNot(exists_query("owner_id")) once released.
|
||||
See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
|
||||
Returns:
|
||||
Tantivy query that filters results to visible documents
|
||||
|
||||
Implementation Notes:
|
||||
- Uses range_query instead of term_query to work around unsigned integer
|
||||
type detection bug in tantivy-py 0.25
|
||||
- Uses boolean_query for "no owner" check since exists_query is not
|
||||
available in tantivy-py 0.25.1 (available in master)
|
||||
- Uses disjunction_max_query to combine permission clauses with OR logic
|
||||
"""
|
||||
owner_any = tantivy.Query.range_query(
|
||||
schema,
|
||||
@@ -380,12 +408,28 @@ def parse_user_query(
|
||||
raw_query: str,
|
||||
tz: tzinfo,
|
||||
) -> tantivy.Query:
|
||||
"""Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse.
|
||||
"""
|
||||
Parse user query through the complete preprocessing pipeline.
|
||||
|
||||
When ADVANCED_FUZZY_SEARCH_THRESHOLD is set (any float), a fuzzy query is blended in as a
|
||||
Should clause boosted at 0.1 — keeping fuzzy hits ranked below exact matches. The fuzzy
|
||||
query uses edit-distance=1, prefix=True, transposition_cost_one=True on all search fields.
|
||||
The threshold float is a post-search minimum-score filter applied in the backend layer, not here.
|
||||
Transforms the raw user query through multiple stages:
|
||||
1. Date keyword rewriting (today → ISO 8601 ranges)
|
||||
2. Query normalization (comma expansion, whitespace cleanup)
|
||||
3. Tantivy parsing with field boosts
|
||||
4. Optional fuzzy query blending (if ADVANCED_FUZZY_SEARCH_THRESHOLD set)
|
||||
|
||||
Args:
|
||||
index: Tantivy index with registered tokenizers
|
||||
raw_query: Original user query string
|
||||
tz: Timezone for date boundary calculations
|
||||
|
||||
Returns:
|
||||
Parsed Tantivy query ready for execution
|
||||
|
||||
Note:
|
||||
When ADVANCED_FUZZY_SEARCH_THRESHOLD is configured, adds a low-priority
|
||||
fuzzy query as a Should clause (0.1 boost) to catch approximate matches
|
||||
while keeping exact matches ranked higher. The threshold value is applied
|
||||
as a post-search score filter, not during query construction.
|
||||
"""
|
||||
|
||||
query_str = rewrite_natural_date_keywords(raw_query, tz)
|
||||
|
||||
@@ -16,7 +16,16 @@ SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
def build_schema() -> tantivy.Schema:
|
||||
"""Build the Tantivy schema for the paperless document index."""
|
||||
"""
|
||||
Build the Tantivy schema for the paperless document index.
|
||||
|
||||
Creates a comprehensive schema supporting full-text search, filtering,
|
||||
sorting, and autocomplete functionality. Includes fields for document
|
||||
content, metadata, permissions, custom fields, and notes.
|
||||
|
||||
Returns:
|
||||
Configured Tantivy schema ready for index creation
|
||||
"""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
|
||||
sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
|
||||
@@ -79,7 +88,19 @@ def build_schema() -> tantivy.Schema:
|
||||
|
||||
|
||||
def needs_rebuild(index_dir: Path) -> bool:
|
||||
"""Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
|
||||
"""
|
||||
Check if the search index needs rebuilding.
|
||||
|
||||
Compares the current schema version and search language configuration
|
||||
against sentinel files to determine if the index is compatible with
|
||||
the current paperless-ngx version and settings.
|
||||
|
||||
Args:
|
||||
index_dir: Path to the search index directory
|
||||
|
||||
Returns:
|
||||
True if the index needs rebuilding, False if it's up to date
|
||||
"""
|
||||
version_file = index_dir / ".schema_version"
|
||||
if not version_file.exists():
|
||||
return True
|
||||
@@ -102,7 +123,15 @@ def needs_rebuild(index_dir: Path) -> bool:
|
||||
|
||||
|
||||
def wipe_index(index_dir: Path) -> None:
|
||||
"""Delete all children in the index directory to prepare for rebuild."""
|
||||
"""
|
||||
Delete all contents of the index directory to prepare for rebuild.
|
||||
|
||||
Recursively removes all files and subdirectories within the index
|
||||
directory while preserving the directory itself.
|
||||
|
||||
Args:
|
||||
index_dir: Path to the search index directory to clear
|
||||
"""
|
||||
for child in list(index_dir.iterdir()):
|
||||
if child.is_dir():
|
||||
shutil.rmtree(child)
|
||||
@@ -118,9 +147,17 @@ def _write_sentinels(index_dir: Path) -> None:
|
||||
|
||||
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
|
||||
"""
|
||||
Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR),
|
||||
creating or rebuilding as needed.
|
||||
Caller must register custom tokenizers after receiving the Index.
|
||||
Open the Tantivy index, creating or rebuilding as needed.
|
||||
|
||||
Checks if the index needs rebuilding due to schema version or language
|
||||
changes. If rebuilding is needed, wipes the directory and creates a fresh
|
||||
index with the current schema and configuration.
|
||||
|
||||
Args:
|
||||
index_dir: Path to index directory (defaults to settings.INDEX_DIR)
|
||||
|
||||
Returns:
|
||||
Opened Tantivy index (caller must register custom tokenizers)
|
||||
"""
|
||||
if index_dir is None:
|
||||
index_dir = settings.INDEX_DIR
|
||||
|
||||
@@ -51,13 +51,21 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||
|
||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
"""
|
||||
Register all custom tokenizers on *index*. Must be called on every Index
|
||||
instance — tantivy requires re-registration at each open.
|
||||
Register all custom tokenizers required by the paperless schema.
|
||||
|
||||
simple_analyzer is also registered as a fast-field tokenizer because the
|
||||
sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
|
||||
Tantivy writes default values for fast columns on every commit, even for
|
||||
documents that omit those fields, so the fast-field tokenizer must exist.
|
||||
Must be called on every Index instance since Tantivy requires tokenizer
|
||||
re-registration after each index open/creation. Registers tokenizers for
|
||||
full-text search, sorting, CJK language support, and fast-field indexing.
|
||||
|
||||
Args:
|
||||
index: Tantivy index instance to register tokenizers on
|
||||
language: ISO 639-1 language code for stemming (None to disable)
|
||||
|
||||
Note:
|
||||
simple_analyzer is registered as both a text and fast-field tokenizer
|
||||
since sort shadow fields (title_sort, correspondent_sort, type_sort)
|
||||
use fast=True and Tantivy requires fast-field tokenizers to exist
|
||||
even for documents that omit those fields.
|
||||
"""
|
||||
index.register_tokenizer("paperless_text", _paperless_text(language))
|
||||
index.register_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
@@ -16,7 +16,7 @@ class TestWriteBatch:
|
||||
"""Test WriteBatch context manager functionality."""
|
||||
|
||||
def test_rolls_back_on_exception(self, backend: TantivyBackend):
|
||||
"""Data integrity: a mid-batch exception must not corrupt the index."""
|
||||
"""Batch operations must rollback on exception to preserve index integrity."""
|
||||
doc = Document.objects.create(
|
||||
title="Rollback Target",
|
||||
content="should survive",
|
||||
@@ -47,7 +47,7 @@ class TestSearch:
|
||||
"""Test search functionality."""
|
||||
|
||||
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
|
||||
"""UI score bar depends on the top hit being 1.0."""
|
||||
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
|
||||
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
|
||||
doc = Document.objects.create(
|
||||
title=title,
|
||||
@@ -68,7 +68,7 @@ class TestSearch:
|
||||
assert all(0.0 <= h["score"] <= 1.0 for h in r.hits)
|
||||
|
||||
def test_owner_filter(self, backend: TantivyBackend):
|
||||
"""Owner can find their document; other user cannot."""
|
||||
"""Document owners can search their private documents; other users cannot access them."""
|
||||
owner = User.objects.create_user("owner")
|
||||
other = User.objects.create_user("other")
|
||||
doc = Document.objects.create(
|
||||
@@ -108,7 +108,7 @@ class TestRebuild:
|
||||
"""Test index rebuilding functionality."""
|
||||
|
||||
def test_with_iter_wrapper_called(self, backend: TantivyBackend):
|
||||
"""rebuild() must pass documents through iter_wrapper."""
|
||||
"""Index rebuild must pass documents through iter_wrapper for progress tracking."""
|
||||
seen = []
|
||||
|
||||
def wrapper(docs):
|
||||
@@ -125,7 +125,7 @@ class TestAutocomplete:
|
||||
"""Test autocomplete functionality."""
|
||||
|
||||
def test_basic_functionality(self, backend: TantivyBackend):
|
||||
"""Autocomplete should find word prefixes."""
|
||||
"""Autocomplete must return words matching the given prefix."""
|
||||
doc = Document.objects.create(
|
||||
title="Invoice from Microsoft Corporation",
|
||||
content="payment details",
|
||||
@@ -138,7 +138,7 @@ class TestAutocomplete:
|
||||
assert "microsoft" in results
|
||||
|
||||
def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
|
||||
"""Most-used prefix match should rank first."""
|
||||
"""Autocomplete results must be ordered by document frequency to prioritize common terms."""
|
||||
# "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should
|
||||
# return "payment" before "payslip".
|
||||
for i, (title, checksum) in enumerate(
|
||||
@@ -166,7 +166,7 @@ class TestMoreLikeThis:
|
||||
"""Test more like this functionality."""
|
||||
|
||||
def test_excludes_original(self, backend: TantivyBackend):
|
||||
"""More like this should not return the original document."""
|
||||
"""More like this queries must exclude the reference document from results."""
|
||||
doc1 = Document.objects.create(
|
||||
title="Important document",
|
||||
content="financial information",
|
||||
@@ -197,9 +197,11 @@ class TestSingleton:
|
||||
reset_backend()
|
||||
|
||||
def test_returns_same_instance_on_repeated_calls(self, index_dir):
|
||||
"""Singleton pattern: repeated calls to get_backend() must return the same instance."""
|
||||
assert get_backend() is get_backend()
|
||||
|
||||
def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
|
||||
"""Backend singleton must reinitialize when INDEX_DIR setting changes for test isolation."""
|
||||
settings.INDEX_DIR = tmp_path / "a"
|
||||
(tmp_path / "a").mkdir()
|
||||
b1 = get_backend()
|
||||
@@ -212,6 +214,7 @@ class TestSingleton:
|
||||
assert b2._path == tmp_path / "b"
|
||||
|
||||
def test_reset_forces_new_instance(self, index_dir):
|
||||
"""reset_backend() must force creation of a new backend instance on next get_backend() call."""
|
||||
b1 = get_backend()
|
||||
reset_backend()
|
||||
b2 = get_backend()
|
||||
@@ -222,7 +225,7 @@ class TestFieldHandling:
|
||||
"""Test handling of various document fields."""
|
||||
|
||||
def test_none_values_handled_correctly(self, backend: TantivyBackend):
|
||||
"""Test that None values for original_filename and page_count are handled properly."""
|
||||
"""Document fields with None values must not cause indexing errors."""
|
||||
doc = Document.objects.create(
|
||||
title="Test Doc",
|
||||
content="test content",
|
||||
@@ -245,7 +248,7 @@ class TestFieldHandling:
|
||||
assert results.total == 1
|
||||
|
||||
def test_custom_fields_include_name_and_value(self, backend: TantivyBackend):
|
||||
"""Custom field indexing should include both name and value."""
|
||||
"""Custom fields must be indexed with both field name and value for structured queries."""
|
||||
# Create a custom field
|
||||
field = CustomField.objects.create(
|
||||
name="Invoice Number",
|
||||
@@ -277,7 +280,7 @@ class TestFieldHandling:
|
||||
assert results.total == 1
|
||||
|
||||
def test_notes_include_user_information(self, backend: TantivyBackend):
|
||||
"""Notes should include user information when available."""
|
||||
"""Notes must be indexed with user information when available for structured queries."""
|
||||
user = User.objects.create_user("notewriter")
|
||||
doc = Document.objects.create(
|
||||
title="Doc with notes",
|
||||
|
||||
@@ -45,7 +45,7 @@ class TestTokenizers:
|
||||
self,
|
||||
content_index: tantivy.Index,
|
||||
) -> None:
|
||||
"""paperless_text normalises diacritics so café is findable as cafe."""
|
||||
"""ASCII folding allows searching accented text with plain ASCII queries."""
|
||||
writer = content_index.writer()
|
||||
doc = tantivy.Document()
|
||||
doc.add_text("content", "café résumé")
|
||||
@@ -56,7 +56,7 @@ class TestTokenizers:
|
||||
assert content_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
|
||||
"""bigram_analyzer makes CJK substrings searchable without whitespace."""
|
||||
"""Bigram tokenizer enables substring search in CJK languages without whitespace delimiters."""
|
||||
writer = bigram_index.writer()
|
||||
doc = tantivy.Document()
|
||||
doc.add_text("bigram_content", "東京都")
|
||||
@@ -67,6 +67,7 @@ class TestTokenizers:
|
||||
assert bigram_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
sb.add_text_field("content", stored=True, tokenizer_name="paperless_text")
|
||||
schema = sb.build()
|
||||
|
||||
@@ -106,6 +106,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
@pytest.mark.django_db
|
||||
class TestMakeIndex:
|
||||
def test_reindex(self, mocker: MockerFixture) -> None:
|
||||
"""Reindex command must call the backend rebuild method to recreate the index."""
|
||||
mock_get_backend = mocker.patch(
|
||||
"documents.management.commands.document_index.get_backend",
|
||||
)
|
||||
@@ -113,12 +114,14 @@ class TestMakeIndex:
|
||||
mock_get_backend.return_value.rebuild.assert_called_once()
|
||||
|
||||
def test_optimize(self) -> None:
|
||||
"""Optimize command must execute without error (Tantivy handles optimization automatically)."""
|
||||
call_command("document_index", "optimize", skip_checks=True)
|
||||
|
||||
def test_reindex_if_needed_skips_when_up_to_date(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Conditional reindex must skip rebuild when schema version and language match."""
|
||||
mocker.patch(
|
||||
"documents.management.commands.document_index.needs_rebuild",
|
||||
return_value=False,
|
||||
@@ -133,6 +136,7 @@ class TestMakeIndex:
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Conditional reindex must proceed with rebuild when schema version or language changed."""
|
||||
mocker.patch(
|
||||
"documents.management.commands.document_index.needs_rebuild",
|
||||
return_value=True,
|
||||
|
||||
@@ -23,8 +23,10 @@ from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
|
||||
|
||||
class TestIndexOptimize(TestCase):
|
||||
@pytest.mark.django_db
|
||||
class TestIndexOptimize:
|
||||
def test_index_optimize(self) -> None:
|
||||
"""Index optimization task must execute without error (Tantivy handles optimization automatically)."""
|
||||
tasks.index_optimize()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user