Files
paperless-ngx/src/documents/search/_backend.py
Trenton Holmes 0b5b6fdad5 refactor: extract _parse_query and _apply_permission_filter helpers
Deduplicates query parsing (3 call sites) and permission filter
wrapping (4 call sites) into private helper methods on TantivyBackend.
Also documents the N-lookup limitation of highlight_hits().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-05 13:31:37 -07:00

1155 lines
40 KiB
Python

from __future__ import annotations
import logging
import threading
from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
from enum import StrEnum
from typing import TYPE_CHECKING
from typing import Self
from typing import TypedDict
from typing import TypeVar
import filelock
import regex
import tantivy
from django.conf import settings
from django.utils.timezone import get_current_timezone
from guardian.shortcuts import get_users_with_perms
from documents.search._normalize import ascii_fold
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_simple_title_query
from documents.search._query import parse_user_query
from documents.search._schema import _write_sentinels
from documents.search._schema import build_schema
from documents.search._schema import open_or_rebuild_index
from documents.search._schema import wipe_index
from documents.search._tokenizer import register_tokenizers
from documents.utils import IterWrapper
from documents.utils import identity
if TYPE_CHECKING:
from pathlib import Path
from django.contrib.auth.base_user import AbstractBaseUser
from django.db.models import QuerySet
from documents.models import Document
# Logger shared by the search backend code in this module.
logger = logging.getLogger("paperless.search")
# Matches runs of word characters; used to tokenize text for autocomplete.
_WORD_RE = regex.compile(r"\w+")
_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted content
# NOTE(review): no use of T is visible in this section of the file — confirm it is still needed.
T = TypeVar("T")
class SearchMode(StrEnum):
QUERY = "query"
TEXT = "text"
TITLE = "title"
def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
    """
    Collect the normalized vocabulary used for autocomplete suggestions.

    Every non-empty source is tokenized with ``_WORD_RE`` (runs of word
    characters); each token is lowercased and passed through ``ascii_fold``.
    Tokens that normalize to an empty string are dropped. Tokenization runs
    with a timeout, and a source that exceeds it is skipped with a warning
    instead of failing the whole extraction.

    Args:
        text_sources: Strings to harvest words from; falsy entries are ignored.

    Returns:
        Set of unique normalized words.
    """
    vocabulary: set[str] = set()
    for source in text_sources:
        if source:
            try:
                raw_tokens = _WORD_RE.findall(
                    source,
                    timeout=_AUTOCOMPLETE_REGEX_TIMEOUT,
                )
            except TimeoutError:  # pragma: no cover
                logger.warning(
                    "Autocomplete word extraction timed out for a text source; skipping.",
                )
            else:
                folded_tokens = (ascii_fold(raw.lower()) for raw in raw_tokens)
                vocabulary.update(word for word in folded_tokens if word)
    return vocabulary
class SearchHit(TypedDict):
    """Shape of a single search result as handed to the API layer."""

    # Primary key of the matching document.
    id: int
    # Relevance score; 0.0 when the hit carries no relevance score.
    score: float
    # 1-based position of the hit in the overall result ordering.
    rank: int
    # Field name -> highlighted snippet; empty when no snippet was generated.
    highlights: dict[str, str]
@dataclass(frozen=True, slots=True)
class SearchResults:
"""
Container for search results with pagination metadata.
Attributes:
hits: List of search results with scores and highlights
total: Total matching documents across all pages (for pagination)
query: Preprocessed query string after date/syntax rewriting
"""
hits: list[SearchHit]
total: int # total matching documents (for pagination)
query: str # preprocessed query string
class TantivyRelevanceList:
    """
    DRF-compatible list wrapper for Tantivy search results.

    Holds a lightweight ordered list of IDs (for pagination count and
    ``selection_data``) together with a small page of rich ``SearchHit``
    dicts (for serialization). DRF's ``PageNumberPagination`` calls
    ``__len__`` to compute the total page count and ``__getitem__`` to
    slice the displayed page.

    Args:
        ordered_ids: All matching document IDs in display order.
        page_hits: Rich SearchHit dicts for the requested DRF page only.
        page_offset: Index into *ordered_ids* where *page_hits* starts.
    """

    def __init__(
        self,
        ordered_ids: list[int],
        page_hits: list[SearchHit],
        page_offset: int = 0,
    ) -> None:
        self._ordered_ids = ordered_ids
        self._page_hits = page_hits
        self._page_offset = page_offset

    def __len__(self) -> int:
        """Return the total number of matching documents (not the page size)."""
        return len(self._ordered_ids)

    def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]:
        """
        Return the hit(s) at *key*.

        Positions covered by the pre-fetched page return the rich hits;
        everything else yields stub hits (score 0.0, no highlights) built
        from the ID list.

        Raises:
            IndexError: If an integer *key* is outside the ID list.
        """
        total = len(self._ordered_ids)
        window_start = self._page_offset
        window_end = window_start + len(self._page_hits)

        if isinstance(key, int):
            idx = key if key >= 0 else total + key
            if window_start <= idx < window_end:
                return self._page_hits[idx - window_start]
            return SearchHit(
                id=self._ordered_ids[key],
                score=0.0,
                rank=idx + 1,
                highlights={},
            )

        # slice.indices() resolves None, negative and out-of-range bounds the
        # same way list slicing does. Using ``key.stop or len(...)`` instead
        # would turn an explicit stop of 0 into "until the end".
        start, stop, step = key.indices(total)

        # DRF slices to extract the current page. If the slice starts at the
        # pre-fetched page and stays inside it, return the rich hits directly.
        if step == 1 and start == window_start and start <= stop <= window_end:
            return self._page_hits[: stop - start]

        # Fallback: stub dicts whose rank is the 1-based position in the
        # full ordering.
        return [
            SearchHit(
                id=self._ordered_ids[position],
                score=0.0,
                rank=position + 1,
                highlights={},
            )
            for position in range(start, stop, step)
        ]

    def get_all_ids(self) -> list[int]:
        """Return all matching document IDs in display order."""
        return self._ordered_ids
class SearchIndexLockError(Exception):
    """Signals that the index file lock was not obtained before the timeout expired."""
class WriteBatch:
"""
Context manager for bulk index operations with file locking.
Provides transactional batch updates to the search index with proper
concurrency control via file locking. All operations within the batch
are committed atomically or rolled back on exception.
Usage:
with backend.batch_update() as batch:
batch.add_or_update(document)
batch.remove(doc_id)
"""
def __init__(self, backend: TantivyBackend, lock_timeout: float):
self._backend = backend
self._lock_timeout = lock_timeout
self._writer = None
self._lock = None
def __enter__(self) -> Self:
if self._backend._path is not None:
lock_path = self._backend._path / ".tantivy.lock"
self._lock = filelock.FileLock(str(lock_path))
try:
self._lock.acquire(timeout=self._lock_timeout)
except filelock.Timeout as e: # pragma: no cover
raise SearchIndexLockError(
f"Could not acquire index lock within {self._lock_timeout}s",
) from e
self._writer = self._backend._index.writer()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
try:
if exc_type is None:
self._writer.commit()
self._backend._index.reload()
# Explicitly delete writer to release tantivy's internal lock.
# On exception the uncommitted writer is simply discarded.
if self._writer is not None:
del self._writer
self._writer = None
finally:
if self._lock is not None:
self._lock.release()
def add_or_update(
self,
document: Document,
effective_content: str | None = None,
) -> None:
"""
Add or update a document in the batch.
Implements upsert behavior by deleting any existing document with the same ID
and adding the new version. This ensures stale document data (e.g., after
permission changes) doesn't persist in the index.
Args:
document: Django Document instance to index
effective_content: Override document.content for indexing (used when
re-indexing with newer OCR text from document versions)
"""
self.remove(document.pk)
doc = self._backend._build_tantivy_doc(document, effective_content)
self._writer.add_document(doc)
def remove(self, doc_id: int) -> None:
"""
Remove a document from the batch by its primary key.
Uses range query instead of term query to work around unsigned integer
type detection bug in tantivy-py 0.25.
"""
# Use range query to work around u64 deletion bug
self._writer.delete_documents_by_query(
tantivy.Query.range_query(
self._backend._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
),
)
class TantivyBackend:
    """
    Tantivy search backend with explicit lifecycle management.

    Provides full-text search capabilities using the Tantivy search engine.
    Supports in-memory indexes (for testing) and persistent on-disk indexes
    (for production use). Handles document indexing, search queries, autocompletion,
    and "more like this" functionality.

    The backend manages its own connection lifecycle and can be reset when
    the underlying index directory changes (e.g., during test isolation).
    """

    # Maps DRF ordering field names to Tantivy index field names.
    # search() and search_ids() only sort by field when the requested
    # ordering name is a key of this mapping.
    SORT_FIELD_MAP: dict[str, str] = {
        "title": "title_sort",
        "correspondent__name": "correspondent_sort",
        "document_type__name": "type_sort",
        "created": "created",
        "added": "added",
        "modified": "modified",
        "archive_serial_number": "asn",
        "page_count": "page_count",
        "num_notes": "num_notes",
    }
    # Fields where Tantivy's sort order matches the ORM's sort order.
    # Text-based fields (title, correspondent__name, document_type__name)
    # are excluded because Tantivy's tokenized fast fields produce different
    # ordering than the ORM's collation-based ordering.
    # NOTE(review): not referenced by the methods in this class —
    # presumably consulted by callers; confirm.
    SORTABLE_FIELDS: frozenset[str] = frozenset(
        {
            "created",
            "added",
            "modified",
            "archive_serial_number",
            "page_count",
            "num_notes",
        },
    )
def __init__(self, path: Path | None = None):
# path=None → in-memory index (for tests)
# path=some_dir → on-disk index (for production)
self._path = path
self._index = None
self._schema = None
def open(self) -> None:
"""
Open or rebuild the index as needed.
For disk-based indexes, checks if rebuilding is needed due to schema
version or language changes. Registers custom tokenizers after opening.
Safe to call multiple times - subsequent calls are no-ops.
"""
if self._index is not None:
return # pragma: no cover
if self._path is not None:
self._index = open_or_rebuild_index(self._path)
else:
self._index = tantivy.Index(build_schema())
register_tokenizers(self._index, settings.SEARCH_LANGUAGE)
self._schema = self._index.schema
def close(self) -> None:
"""
Close the index and release resources.
Safe to call multiple times - subsequent calls are no-ops.
"""
self._index = None
self._schema = None
def _ensure_open(self) -> None:
"""Ensure the index is open before operations."""
if self._index is None:
self.open() # pragma: no cover
def _parse_query(
    self,
    query: str,
    search_mode: SearchMode,
) -> tantivy.Query:
    """
    Turn a raw query string into a Tantivy query for the given mode.

    TEXT and TITLE use the simple text/title parsers; every other mode is
    handled by ``parse_user_query``, which also receives the current
    timezone.
    """
    current_tz = get_current_timezone()
    if search_mode is SearchMode.TEXT:
        return parse_simple_text_query(self._index, query)
    if search_mode is SearchMode.TITLE:
        return parse_simple_title_query(self._index, query)
    return parse_user_query(self._index, query, current_tz)
def _apply_permission_filter(
self,
query: tantivy.Query,
user: AbstractBaseUser | None,
) -> tantivy.Query:
"""Wrap a query with a permission filter if the user is not a superuser."""
if user is not None:
permission_filter = build_permission_filter(self._schema, user)
return tantivy.Query.boolean_query(
[
(tantivy.Occur.Must, query),
(tantivy.Occur.Must, permission_filter),
],
)
return query
def _build_tantivy_doc(
    self,
    document: Document,
    effective_content: str | None = None,
) -> tantivy.Document:
    """Build a tantivy Document from a Django Document instance.

    ``effective_content`` overrides ``document.content`` for indexing —
    used when re-indexing a root document with a newer version's OCR text.

    Args:
        document: Django Document instance to convert
        effective_content: Text to index instead of ``document.content``;
            None means use ``document.content``

    Returns:
        A new tantivy Document populated with the fields added below
    """
    content = (
        effective_content if effective_content is not None else document.content
    )
    doc = tantivy.Document()
    # Basic fields
    # The title and the content are each written to several index fields.
    doc.add_unsigned("id", document.pk)
    doc.add_text("checksum", document.checksum)
    doc.add_text("title", document.title)
    doc.add_text("title_sort", document.title)
    doc.add_text("simple_title", document.title)
    doc.add_text("content", content)
    doc.add_text("bigram_content", content)
    doc.add_text("simple_content", content)
    # Original filename - only add if not None/empty
    if document.original_filename:
        doc.add_text("original_filename", document.original_filename)
    # Correspondent
    if document.correspondent:
        doc.add_text("correspondent", document.correspondent.name)
        doc.add_text("correspondent_sort", document.correspondent.name)
        doc.add_unsigned("correspondent_id", document.correspondent_id)
    # Document type
    if document.document_type:
        doc.add_text("document_type", document.document_type.name)
        doc.add_text("type_sort", document.document_type.name)
        doc.add_unsigned("document_type_id", document.document_type_id)
    # Storage path
    if document.storage_path:
        doc.add_text("storage_path", document.storage_path.name)
        doc.add_unsigned("storage_path_id", document.storage_path_id)
    # Tags — collect names for autocomplete in the same pass
    tag_names: list[str] = []
    for tag in document.tags.all():
        doc.add_text("tag", tag.name)
        doc.add_unsigned("tag_id", tag.pk)
        tag_names.append(tag.name)
    # Notes — JSON for structured queries (notes.user:alice, notes.note:text),
    # companion text field for default full-text search.
    # NOTE(review): only the JSON field is written here; no companion text
    # field is added in this method — confirm where it comes from.
    # NOTE(review): note.user.username raises AttributeError if a note has
    # no user — confirm Note.user can never be None.
    num_notes = 0
    for note in document.notes.all():
        num_notes += 1
        doc.add_json("notes", {"note": note.note, "user": note.user.username})
    # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
    # companion text field for default full-text search.
    # NOTE(review): as with notes, only the JSON field is written here.
    for cfi in document.custom_fields.all():
        search_value = cfi.value_for_search
        # Skip fields where there is no value yet
        if search_value is None:
            continue
        doc.add_json(
            "custom_fields",
            {
                "name": cfi.field.name,
                "value": search_value,
            },
        )
    # Dates
    # "created" is indexed as midnight UTC of the document's created
    # year/month/day; "modified" and "added" are passed through unchanged.
    created_date = datetime(
        document.created.year,
        document.created.month,
        document.created.day,
        tzinfo=UTC,
    )
    doc.add_date("created", created_date)
    doc.add_date("modified", document.modified)
    doc.add_date("added", document.added)
    if document.archive_serial_number is not None:
        doc.add_unsigned("asn", document.archive_serial_number)
    if document.page_count is not None:
        doc.add_unsigned("page_count", document.page_count)
    doc.add_unsigned("num_notes", num_notes)
    # Owner
    if document.owner_id:
        doc.add_unsigned("owner_id", document.owner_id)
    # Viewers with permission
    # One viewer_id entry per user returned for the "view_document" permission.
    users_with_perms = get_users_with_perms(
        document,
        only_with_perms_in=["view_document"],
    )
    for user in users_with_perms:
        doc.add_unsigned("viewer_id", user.pk)
    # Autocomplete words
    # Sources: title, content, correspondent name, document type name, tag names.
    text_sources = [document.title, content]
    if document.correspondent:
        text_sources.append(document.correspondent.name)
    if document.document_type:
        text_sources.append(document.document_type.name)
    text_sources.extend(tag_names)
    # Sorted so the words are added in a deterministic order.
    for word in sorted(_extract_autocomplete_words(text_sources)):
        doc.add_text("autocomplete_word", word)
    return doc
def add_or_update(
    self,
    document: Document,
    effective_content: str | None = None,
) -> None:
    """
    Index one document (insert or replace) inside its own write batch.

    Opens a ``batch_update()`` with a 5 second lock timeout and forwards
    the document to it. Prefer ``batch_update()`` directly when indexing
    many documents.

    Args:
        document: Django Document instance to index
        effective_content: Text to index instead of ``document.content``
    """
    self._ensure_open()
    with self.batch_update(lock_timeout=5.0) as single_doc_batch:
        single_doc_batch.add_or_update(document, effective_content)
def remove(self, doc_id: int) -> None:
    """
    Delete one document from the index inside its own write batch.

    Opens a ``batch_update()`` with a 5 second lock timeout and forwards
    the removal to it. Prefer ``batch_update()`` directly when removing
    many documents.

    Args:
        doc_id: Primary key of the document to remove
    """
    self._ensure_open()
    with self.batch_update(lock_timeout=5.0) as single_doc_batch:
        single_doc_batch.remove(doc_id)
def search(
    self,
    query: str,
    user: AbstractBaseUser | None,
    page: int,
    page_size: int,
    sort_field: str | None,
    *,
    sort_reverse: bool,
    search_mode: SearchMode = SearchMode.QUERY,
    highlight_page: int | None = None,
    highlight_page_size: int | None = None,
) -> SearchResults:
    """
    Execute a search query against the document index.

    The query is parsed according to *search_mode*, wrapped with the
    permission filter for *user*, and executed either ordered by a mapped
    sort field or by relevance. A *sort_field* that is not a key of
    ``SORT_FIELD_MAP`` is treated exactly like no sort field: results are
    ranked by relevance, scores are normalized and the fuzzy threshold is
    applied (consistent with ``search_ids``).

    Args:
        query: User's search query
        user: User for permission filtering (None for superuser/no filtering)
        page: Page number (1-indexed) for pagination
        page_size: Number of results per page
        sort_field: Field to sort by (None or unmapped for relevance ranking)
        sort_reverse: Whether to reverse the sort order (field sorting only)
        search_mode: QUERY for advanced Tantivy syntax, TEXT for plain-text
            search over title and content, TITLE for plain-text search
            over title only
        highlight_page: 1-indexed page, counted within the returned hits,
            that should receive highlights
        highlight_page_size: Size of the highlight pages; when either
            highlight argument is None every returned hit is eligible

    Returns:
        SearchResults with the page's hits, the total match count and the
        query string as passed in
    """
    self._ensure_open()
    user_query = self._parse_query(query, search_mode)
    final_query = self._apply_permission_filter(user_query, user)
    searcher = self._index.searcher()
    offset = (page - 1) * page_size
    fetch_limit = offset + page_size

    # A single flag decides between field ordering and relevance ranking so
    # that fetching, normalization and thresholding can never disagree.
    mapped_field = self.SORT_FIELD_MAP.get(sort_field) if sort_field else None
    by_relevance = mapped_field is None

    if by_relevance:
        # Score-based search: hits are (score, DocAddress) tuples
        results = searcher.search(final_query, limit=fetch_limit)
        all_hits = [(hit[1], hit[0]) for hit in results.hits]
    else:
        results = searcher.search(
            final_query,
            limit=fetch_limit,
            order_by_field=mapped_field,
            order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
        )
        # Field sorting: only the DocAddress is used; the score is fixed at 0.0
        all_hits = [(hit[1], 0.0) for hit in results.hits]

    # NOTE(review): total is Tantivy's match count and is not reduced when
    # the threshold below drops hits — confirm this is intended.
    total = results.count

    if by_relevance:
        if all_hits:
            # Normalize so the best fetched hit scores 1.0; guard against
            # a maximum of 0.
            max_score = max(score for _, score in all_hits) or 1.0
            all_hits = [
                (address, score / max_score) for address, score in all_hits
            ]
        threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
        if threshold is not None:
            all_hits = [hit for hit in all_hits if hit[1] >= threshold]

    # Get the page's hits
    page_hits = all_hits[offset : offset + page_size]

    # Determine which hits (0-based index within page_hits) need highlights
    if highlight_page is not None and highlight_page_size is not None:
        hl_start = (highlight_page - 1) * highlight_page_size
        hl_end = hl_start + highlight_page_size
    else:
        # Highlight all hits (backward-compatible default)
        hl_start = 0
        hl_end = len(page_hits)

    hits: list[SearchHit] = []
    # Snippet generators are created lazily and reused for the whole page.
    snippet_generator = None
    notes_snippet_generator = None
    for hit_index, (doc_address, score) in enumerate(page_hits):
        rank = offset + hit_index + 1
        actual_doc = searcher.doc(doc_address)
        doc_dict = actual_doc.to_dict()
        doc_id = doc_dict["id"][0]
        highlights: dict[str, str] = {}
        # Field-sorted hits carry score 0.0 and therefore never get highlights.
        if score > 0 and hl_start <= hit_index < hl_end:
            try:
                if snippet_generator is None:
                    snippet_generator = tantivy.SnippetGenerator.create(
                        searcher,
                        final_query,
                        self._schema,
                        "content",
                    )
                content_snippet = snippet_generator.snippet_from_doc(actual_doc)
                if content_snippet:
                    highlights["content"] = str(content_snippet)
                # Try notes highlights
                if "notes" in doc_dict:
                    if notes_snippet_generator is None:
                        notes_snippet_generator = tantivy.SnippetGenerator.create(
                            searcher,
                            final_query,
                            self._schema,
                            "notes",
                        )
                    notes_snippet = notes_snippet_generator.snippet_from_doc(
                        actual_doc,
                    )
                    if notes_snippet:
                        highlights["notes"] = str(notes_snippet)
            except Exception:  # pragma: no cover
                # Highlights are best-effort; keep the hit but record why
                # snippet generation failed.
                logger.debug(
                    "Failed to generate highlights for doc %s",
                    doc_id,
                    exc_info=True,
                )
        hits.append(
            SearchHit(
                id=doc_id,
                score=score,
                rank=rank,
                highlights=highlights,
            ),
        )
    return SearchResults(
        hits=hits,
        total=total,
        query=query,
    )
def highlight_hits(
self,
query: str,
doc_ids: list[int],
*,
search_mode: SearchMode = SearchMode.QUERY,
) -> list[SearchHit]:
"""
Generate SearchHit dicts with highlights for specific document IDs.
Unlike search(), this does not execute a ranked query — it looks up
each document by ID and generates snippets against the provided query.
Use this when you already know which documents to display (from
search_ids + ORM filtering) and just need highlight data.
Note: Each doc_id requires an individual index lookup because tantivy-py
does not expose a batch doc-address-by-ID API. This is acceptable for
page-sized batches (typically 25 docs) but should not be called with
thousands of IDs.
Args:
query: The search query (used for snippet generation)
doc_ids: Ordered list of document IDs to generate hits for
search_mode: Query parsing mode (for building the snippet query)
Returns:
List of SearchHit dicts in the same order as doc_ids
"""
if not doc_ids:
return []
self._ensure_open()
user_query = self._parse_query(query, search_mode)
searcher = self._index.searcher()
snippet_generator = None
hits: list[SearchHit] = []
for rank, doc_id in enumerate(doc_ids, start=1):
# Look up document by ID
id_query = tantivy.Query.range_query(
self._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
)
results = searcher.search(id_query, limit=1)
if not results.hits:
continue
doc_address = results.hits[0][1]
actual_doc = searcher.doc(doc_address)
doc_dict = actual_doc.to_dict()
highlights: dict[str, str] = {}
try:
if snippet_generator is None:
snippet_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
self._schema,
"content",
)
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
if content_snippet:
highlights["content"] = str(content_snippet)
if "notes" in doc_dict:
notes_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
self._schema,
"notes",
)
notes_snippet = notes_generator.snippet_from_doc(actual_doc)
if notes_snippet:
highlights["notes"] = str(notes_snippet)
except Exception: # pragma: no cover
logger.debug("Failed to generate highlights for doc %s", doc_id)
hits.append(
SearchHit(
id=doc_id,
score=0.0,
rank=rank,
highlights=highlights,
),
)
return hits
def search_ids(
    self,
    query: str,
    user: AbstractBaseUser | None,
    *,
    sort_field: str | None = None,
    sort_reverse: bool = False,
    search_mode: SearchMode = SearchMode.QUERY,
    limit: int | None = None,
) -> list[int]:
    """
    Return document IDs matching a query — no highlights or scores.

    This is the lightweight companion to search(). Use it when you need the
    full set of matching IDs (e.g. for ``selection_data``) but don't need
    scores, ranks, or highlights.

    Args:
        query: User's search query
        user: User for permission filtering (None for superuser/no filtering)
        sort_field: Field to sort by (None or unmapped for relevance ranking)
        sort_reverse: Whether to reverse the sort order (field sorting only)
        search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
        limit: Maximum number of IDs to return (None = all matching docs)

    Returns:
        List of document IDs in the requested order
    """
    self._ensure_open()
    user_query = self._parse_query(query, search_mode)
    final_query = self._apply_permission_filter(user_query, user)
    searcher = self._index.searcher()

    if limit is not None:
        effective_limit = limit
    else:
        # NOTE(review): tantivy's TopDocs collector documents a panic for a
        # limit of 0, which is what num_docs yields on an empty index.
        # Clamping to 1 is harmless: an empty index still returns no hits.
        effective_limit = max(searcher.num_docs, 1)

    mapped_field = self.SORT_FIELD_MAP.get(sort_field) if sort_field else None
    if mapped_field is not None:
        results = searcher.search(
            final_query,
            limit=effective_limit,
            order_by_field=mapped_field,
            order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
        )
        addresses = [hit[1] for hit in results.hits]
    else:
        results = searcher.search(final_query, limit=effective_limit)
        scored = [(hit[1], hit[0]) for hit in results.hits]
        # Normalize scores and apply threshold (relevance search only)
        if scored:
            max_score = max(score for _, score in scored) or 1.0
            scored = [(address, score / max_score) for address, score in scored]
            threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
            if threshold is not None:
                scored = [hit for hit in scored if hit[1] >= threshold]
        addresses = [address for address, _ in scored]

    # Each address is resolved to its stored document to read the "id" field.
    return [searcher.doc(address).to_dict()["id"][0] for address in addresses]
def autocomplete(
    self,
    term: str,
    limit: int,
    user: AbstractBaseUser | None = None,
) -> list[str]:
    """
    Get autocomplete suggestions for search queries.

    Returns words that start with the given term prefix, ranked by document
    frequency (how many documents contain each word). Optionally filters
    results to only words from documents visible to the specified user.

    Args:
        term: Prefix to match against autocomplete words; it is lowercased
            and ascii-folded before matching
        limit: Maximum number of suggestions to return
        user: User for permission filtering (None or a superuser means no
            filtering)

    Returns:
        List of word suggestions ordered by frequency, then alphabetically
    """
    self._ensure_open()
    normalized_term = ascii_fold(term.lower())
    searcher = self._index.searcher()
    # Apply permission filter for non-superusers so autocomplete words
    # from invisible documents don't leak to other users.
    if user is not None and not user.is_superuser:
        base_query = build_permission_filter(self._schema, user)
    else:
        base_query = tantivy.Query.all_query()
    # NOTE(review): tantivy's TopDocs collector documents a panic for a
    # limit of 0, which is what num_docs yields on an empty index.
    # Clamping to 1 is harmless: an empty index still returns no hits.
    results = searcher.search(base_query, limit=max(searcher.num_docs, 1))
    # Count how many visible documents each matching word appears in.
    # Using Counter (not set) preserves per-word document frequency so
    # suggestions can be ranked by how commonly they occur. Only prefix
    # matches are counted, so non-matching words are never accumulated.
    word_counts: Counter[str] = Counter()
    for _score, doc_address in results.hits:
        doc_dict = searcher.doc(doc_address).to_dict()
        if "autocomplete_word" in doc_dict:
            word_counts.update(
                word
                for word in doc_dict["autocomplete_word"]
                if word.startswith(normalized_term)
            )
    # Sort by document frequency descending; break ties alphabetically
    # for stable, deterministic output.
    matches = sorted(word_counts, key=lambda w: (-word_counts[w], w))
    return matches[:limit]
def more_like_this(
    self,
    doc_id: int,
    user: AbstractBaseUser | None,
    page: int,
    page_size: int,
) -> SearchResults:
    """
    Find documents similar to a reference document.

    The reference document is located by ID, a Tantivy "more like this"
    query is built from its address, the permission filter for *user* is
    applied, and the requested page is returned with scores normalized
    against the best fetched hit. The reference document itself is left
    out of the hits.

    Args:
        doc_id: Primary key of the reference document
        user: User for permission filtering (None for no filtering)
        page: Page number (1-indexed) for pagination
        page_size: Number of results per page

    Returns:
        SearchResults with similar documents; empty when the reference
        document is not in the index. ``total`` is the match count minus
        one (never below zero) to account for the reference document.
    """
    self._ensure_open()
    searcher = self._index.searcher()
    result_label = f"more_like:{doc_id}"

    # Resolve the reference document's address via an exact-ID range query.
    lookup = searcher.search(
        tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        ),
        limit=1,
    )
    if not lookup.hits:
        return SearchResults(hits=[], total=0, query=result_label)
    reference_address = lookup.hits[0][1]

    similarity_query = tantivy.Query.more_like_this_query(
        reference_address,
        min_doc_frequency=1,
        max_doc_frequency=None,
        min_term_frequency=1,
        max_query_terms=12,
        min_word_length=None,
        max_word_length=None,
        boost_factor=None,
    )
    final_query = self._apply_permission_filter(similarity_query, user)

    offset = (page - 1) * page_size
    results = searcher.search(final_query, limit=offset + page_size)

    # Hits arrive as (score, address); flip to (address, score) and normalize.
    scored = [(hit[1], hit[0]) for hit in results.hits]
    if scored:
        best = max(score for _, score in scored) or 1.0
        scored = [(address, score / best) for address, score in scored]

    hits: list[SearchHit] = []
    current_page = scored[offset : offset + page_size]
    for rank, (address, score) in enumerate(current_page, start=offset + 1):
        found_id = searcher.doc(address).to_dict()["id"][0]
        # The reference document is not reported as similar to itself.
        if found_id != doc_id:
            hits.append(
                SearchHit(
                    id=found_id,
                    score=score,
                    rank=rank,
                    highlights={},  # MLT doesn't generate highlights
                ),
            )
    return SearchResults(
        hits=hits,
        total=max(0, results.count - 1),  # Subtract 1 for the original document
        query=result_label,
    )
def more_like_this_ids(
    self,
    doc_id: int,
    user: AbstractBaseUser | None,
    *,
    limit: int | None = None,
) -> list[int]:
    """
    Return the IDs of documents similar to a reference document.

    ID-only counterpart of more_like_this(): no scores, ranks or
    highlights. The reference document is left out of the result.

    Args:
        doc_id: Primary key of the reference document
        user: User for permission filtering (None for no filtering)
        limit: Maximum number of hits to fetch (None = the index's document count)

    Returns:
        List of similar document IDs; empty when the reference document is
        not in the index
    """
    self._ensure_open()
    searcher = self._index.searcher()

    # Resolve the reference document's address via an exact-ID range query.
    lookup = searcher.search(
        tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        ),
        limit=1,
    )
    if not lookup.hits:
        return []
    reference_address = lookup.hits[0][1]

    similarity_query = tantivy.Query.more_like_this_query(
        reference_address,
        min_doc_frequency=1,
        max_doc_frequency=None,
        min_term_frequency=1,
        max_query_terms=12,
        min_word_length=None,
        max_word_length=None,
        boost_factor=None,
    )
    final_query = self._apply_permission_filter(similarity_query, user)
    fetch_limit = searcher.num_docs if limit is None else limit
    results = searcher.search(final_query, limit=fetch_limit)

    found_ids = (
        searcher.doc(address).to_dict()["id"][0] for _score, address in results.hits
    )
    return [found_id for found_id in found_ids if found_id != doc_id]
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
    """
    Create a write batch for bulk index changes.

    The returned object is meant to be used in a ``with`` statement; the
    queued operations are committed when the block exits without error.

    Args:
        lock_timeout: Seconds the batch waits for the index file lock

    Returns:
        WriteBatch context manager bound to this backend

    Raises:
        SearchIndexLockError: When entering the batch, if the lock cannot
            be acquired within the timeout
    """
    self._ensure_open()
    batch = WriteBatch(self, lock_timeout)
    return batch
def rebuild(
    self,
    documents: QuerySet[Document],
    iter_wrapper: IterWrapper[Document] = identity,
) -> None:
    """
    Rebuild the entire search index from scratch.

    Creates a new index (wiping the on-disk one first when a path is set)
    and re-indexes all provided documents in a single commit. On failure,
    the instance's previous index and schema references are restored
    before the exception is re-raised.

    NOTE(review): for on-disk indexes wipe_index() has already run by the
    time indexing can fail, so restoring the references presumably does
    not bring back the previous on-disk data — confirm.

    Args:
        documents: QuerySet of Document instances to index
        iter_wrapper: Optional wrapper function for progress tracking
            (e.g., progress bar). Should yield each document unchanged.
    """
    # Create new index (on-disk or in-memory)
    if self._path is not None:
        wipe_index(self._path)
        new_index = tantivy.Index(build_schema(), path=str(self._path))
        _write_sentinels(self._path)
    else:
        new_index = tantivy.Index(build_schema())
    register_tokenizers(new_index, settings.SEARCH_LANGUAGE)
    # Point instance at the new index so _build_tantivy_doc uses it
    # Keep the old handles so they can be restored if indexing fails.
    old_index, old_schema = self._index, self._schema
    self._index = new_index
    self._schema = new_index.schema
    try:
        writer = new_index.writer()
        for document in iter_wrapper(documents):
            # Index each document with its effective content.
            doc = self._build_tantivy_doc(
                document,
                document.get_effective_content(),
            )
            writer.add_document(doc)
        # Single commit after all documents were added, then reload the index.
        writer.commit()
        new_index.reload()
    except BaseException:  # pragma: no cover
        # Restore old index on failure so the backend remains usable
        self._index = old_index
        self._schema = old_schema
        raise
# Module-level singleton with proper thread safety
# Written by get_backend() and reset_backend() while holding _backend_lock.
_backend: TantivyBackend | None = None
_backend_path: Path | None = None # tracks which INDEX_DIR the singleton uses
# Re-entrant lock guarding creation, replacement and reset of the singleton.
_backend_lock = threading.RLock()
def get_backend() -> TantivyBackend:
    """
    Get the global backend instance with thread safety.

    Returns a singleton TantivyBackend instance, automatically reinitializing
    when settings.INDEX_DIR changes. This ensures proper test isolation when
    using pytest-xdist or @override_settings that change the index directory.

    A replacement backend is published to the module globals only after it
    has been opened successfully, so ``_backend`` and ``_backend_path``
    always describe the same index even if opening fails.

    Returns:
        Singleton TantivyBackend instance for the current INDEX_DIR
    """
    global _backend, _backend_path
    current_path: Path = settings.INDEX_DIR

    # Fast path (no lock): work on local snapshots so a concurrent
    # reset_backend() cannot turn the returned value into None between the
    # check and the return. The path is read first because writers publish
    # the backend before the path.
    published_path = _backend_path
    backend = _backend
    if backend is not None and published_path == current_path:
        return backend

    # Slow path: first call, or INDEX_DIR changed between calls
    with _backend_lock:
        # Re-check after acquiring the lock — another thread may have beaten us
        if _backend is not None and _backend_path == current_path:
            return _backend  # pragma: no cover
        if _backend is not None:
            _backend.close()
        # Build and open into a local first. If open() raises, the globals
        # still hold the previous (backend, path) pair instead of a
        # half-initialized backend registered under a stale path.
        new_backend = TantivyBackend(path=current_path)
        new_backend.open()
        _backend = new_backend
        _backend_path = current_path
        return new_backend
def reset_backend() -> None:
    """
    Discard the global backend instance while holding the backend lock.

    Closes the current backend, if any, and clears both the instance and
    its recorded path so that the next get_backend() call builds a new one.
    Used for test isolation and when switching between index directories.
    """
    global _backend, _backend_path
    with _backend_lock:
        current = _backend
        if current is not None:
            current.close()
        _backend, _backend_path = None, None