mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-06 16:18:51 +00:00
Deduplicates query parsing (3 call sites) and permission filter wrapping (4 call sites) into private helper methods on TantivyBackend. Also documents the N-lookup limitation of highlight_hits(). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1155 lines
40 KiB
Python
1155 lines
40 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
import threading
|
|
from collections import Counter
|
|
from dataclasses import dataclass
|
|
from datetime import UTC
|
|
from datetime import datetime
|
|
from enum import StrEnum
|
|
from typing import TYPE_CHECKING
|
|
from typing import Self
|
|
from typing import TypedDict
|
|
from typing import TypeVar
|
|
|
|
import filelock
|
|
import regex
|
|
import tantivy
|
|
from django.conf import settings
|
|
from django.utils.timezone import get_current_timezone
|
|
from guardian.shortcuts import get_users_with_perms
|
|
|
|
from documents.search._normalize import ascii_fold
|
|
from documents.search._query import build_permission_filter
|
|
from documents.search._query import parse_simple_text_query
|
|
from documents.search._query import parse_simple_title_query
|
|
from documents.search._query import parse_user_query
|
|
from documents.search._schema import _write_sentinels
|
|
from documents.search._schema import build_schema
|
|
from documents.search._schema import open_or_rebuild_index
|
|
from documents.search._schema import wipe_index
|
|
from documents.search._tokenizer import register_tokenizers
|
|
from documents.utils import IterWrapper
|
|
from documents.utils import identity
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
from django.contrib.auth.base_user import AbstractBaseUser
|
|
from django.db.models import QuerySet
|
|
|
|
from documents.models import Document
|
|
|
|
logger = logging.getLogger("paperless.search")
|
|
|
|
_WORD_RE = regex.compile(r"\w+")
|
|
_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted content
|
|
|
|
T = TypeVar("T")
|
|
|
|
|
|
class SearchMode(StrEnum):
|
|
QUERY = "query"
|
|
TEXT = "text"
|
|
TITLE = "title"
|
|
|
|
|
|
def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
|
|
"""Extract and normalize words for autocomplete.
|
|
|
|
Splits on non-word characters (matching Tantivy's simple tokenizer), lowercases,
|
|
and ascii-folds each token. Uses the regex library with a timeout to guard against
|
|
ReDoS on untrusted document content.
|
|
"""
|
|
words = set()
|
|
for text in text_sources:
|
|
if not text:
|
|
continue
|
|
try:
|
|
tokens = _WORD_RE.findall(text, timeout=_AUTOCOMPLETE_REGEX_TIMEOUT)
|
|
except TimeoutError: # pragma: no cover
|
|
logger.warning(
|
|
"Autocomplete word extraction timed out for a text source; skipping.",
|
|
)
|
|
continue
|
|
for token in tokens:
|
|
normalized = ascii_fold(token.lower())
|
|
if normalized:
|
|
words.add(normalized)
|
|
return words
|
|
|
|
|
|
class SearchHit(TypedDict):
|
|
"""Type definition for search result hits."""
|
|
|
|
id: int
|
|
score: float
|
|
rank: int
|
|
highlights: dict[str, str]
|
|
|
|
|
|
@dataclass(frozen=True, slots=True)
|
|
class SearchResults:
|
|
"""
|
|
Container for search results with pagination metadata.
|
|
|
|
Attributes:
|
|
hits: List of search results with scores and highlights
|
|
total: Total matching documents across all pages (for pagination)
|
|
query: Preprocessed query string after date/syntax rewriting
|
|
"""
|
|
|
|
hits: list[SearchHit]
|
|
total: int # total matching documents (for pagination)
|
|
query: str # preprocessed query string
|
|
|
|
|
|
class TantivyRelevanceList:
|
|
"""
|
|
DRF-compatible list wrapper for Tantivy search results.
|
|
|
|
Holds a lightweight ordered list of IDs (for pagination count and
|
|
``selection_data``) together with a small page of rich ``SearchHit``
|
|
dicts (for serialization). DRF's ``PageNumberPagination`` calls
|
|
``__len__`` to compute the total page count and ``__getitem__`` to
|
|
slice the displayed page.
|
|
|
|
Args:
|
|
ordered_ids: All matching document IDs in display order.
|
|
page_hits: Rich SearchHit dicts for the requested DRF page only.
|
|
page_offset: Index into *ordered_ids* where *page_hits* starts.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
ordered_ids: list[int],
|
|
page_hits: list[SearchHit],
|
|
page_offset: int = 0,
|
|
) -> None:
|
|
self._ordered_ids = ordered_ids
|
|
self._page_hits = page_hits
|
|
self._page_offset = page_offset
|
|
|
|
def __len__(self) -> int:
|
|
return len(self._ordered_ids)
|
|
|
|
def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]:
|
|
if isinstance(key, int):
|
|
idx = key if key >= 0 else len(self._ordered_ids) + key
|
|
if self._page_offset <= idx < self._page_offset + len(self._page_hits):
|
|
return self._page_hits[idx - self._page_offset]
|
|
return SearchHit(
|
|
id=self._ordered_ids[key],
|
|
score=0.0,
|
|
rank=idx + 1,
|
|
highlights={},
|
|
)
|
|
start = key.start or 0
|
|
stop = key.stop or len(self._ordered_ids)
|
|
# DRF slices to extract the current page. If the slice aligns
|
|
# with our pre-fetched page_hits, return them directly.
|
|
if start == self._page_offset and stop <= self._page_offset + len(
|
|
self._page_hits,
|
|
):
|
|
return self._page_hits[: stop - start]
|
|
# Fallback: return stub dicts (no highlights).
|
|
return [
|
|
SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
|
|
for i, doc_id in enumerate(self._ordered_ids[key])
|
|
]
|
|
|
|
def get_all_ids(self) -> list[int]:
|
|
"""Return all matching document IDs in display order."""
|
|
return self._ordered_ids
|
|
|
|
|
|
class SearchIndexLockError(Exception):
|
|
"""Raised when the search index file lock cannot be acquired within the timeout."""
|
|
|
|
|
|
class WriteBatch:
|
|
"""
|
|
Context manager for bulk index operations with file locking.
|
|
|
|
Provides transactional batch updates to the search index with proper
|
|
concurrency control via file locking. All operations within the batch
|
|
are committed atomically or rolled back on exception.
|
|
|
|
Usage:
|
|
with backend.batch_update() as batch:
|
|
batch.add_or_update(document)
|
|
batch.remove(doc_id)
|
|
"""
|
|
|
|
def __init__(self, backend: TantivyBackend, lock_timeout: float):
|
|
self._backend = backend
|
|
self._lock_timeout = lock_timeout
|
|
self._writer = None
|
|
self._lock = None
|
|
|
|
def __enter__(self) -> Self:
|
|
if self._backend._path is not None:
|
|
lock_path = self._backend._path / ".tantivy.lock"
|
|
self._lock = filelock.FileLock(str(lock_path))
|
|
try:
|
|
self._lock.acquire(timeout=self._lock_timeout)
|
|
except filelock.Timeout as e: # pragma: no cover
|
|
raise SearchIndexLockError(
|
|
f"Could not acquire index lock within {self._lock_timeout}s",
|
|
) from e
|
|
|
|
self._writer = self._backend._index.writer()
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
try:
|
|
if exc_type is None:
|
|
self._writer.commit()
|
|
self._backend._index.reload()
|
|
# Explicitly delete writer to release tantivy's internal lock.
|
|
# On exception the uncommitted writer is simply discarded.
|
|
if self._writer is not None:
|
|
del self._writer
|
|
self._writer = None
|
|
finally:
|
|
if self._lock is not None:
|
|
self._lock.release()
|
|
|
|
def add_or_update(
|
|
self,
|
|
document: Document,
|
|
effective_content: str | None = None,
|
|
) -> None:
|
|
"""
|
|
Add or update a document in the batch.
|
|
|
|
Implements upsert behavior by deleting any existing document with the same ID
|
|
and adding the new version. This ensures stale document data (e.g., after
|
|
permission changes) doesn't persist in the index.
|
|
|
|
Args:
|
|
document: Django Document instance to index
|
|
effective_content: Override document.content for indexing (used when
|
|
re-indexing with newer OCR text from document versions)
|
|
"""
|
|
self.remove(document.pk)
|
|
doc = self._backend._build_tantivy_doc(document, effective_content)
|
|
self._writer.add_document(doc)
|
|
|
|
def remove(self, doc_id: int) -> None:
|
|
"""
|
|
Remove a document from the batch by its primary key.
|
|
|
|
Uses range query instead of term query to work around unsigned integer
|
|
type detection bug in tantivy-py 0.25.
|
|
"""
|
|
# Use range query to work around u64 deletion bug
|
|
self._writer.delete_documents_by_query(
|
|
tantivy.Query.range_query(
|
|
self._backend._schema,
|
|
"id",
|
|
tantivy.FieldType.Unsigned,
|
|
doc_id,
|
|
doc_id,
|
|
),
|
|
)
|
|
|
|
|
|
class TantivyBackend:
|
|
"""
|
|
Tantivy search backend with explicit lifecycle management.
|
|
|
|
Provides full-text search capabilities using the Tantivy search engine.
|
|
Supports in-memory indexes (for testing) and persistent on-disk indexes
|
|
(for production use). Handles document indexing, search queries, autocompletion,
|
|
and "more like this" functionality.
|
|
|
|
The backend manages its own connection lifecycle and can be reset when
|
|
the underlying index directory changes (e.g., during test isolation).
|
|
"""
|
|
|
|
# Maps DRF ordering field names to Tantivy index field names.
|
|
SORT_FIELD_MAP: dict[str, str] = {
|
|
"title": "title_sort",
|
|
"correspondent__name": "correspondent_sort",
|
|
"document_type__name": "type_sort",
|
|
"created": "created",
|
|
"added": "added",
|
|
"modified": "modified",
|
|
"archive_serial_number": "asn",
|
|
"page_count": "page_count",
|
|
"num_notes": "num_notes",
|
|
}
|
|
|
|
# Fields where Tantivy's sort order matches the ORM's sort order.
|
|
# Text-based fields (title, correspondent__name, document_type__name)
|
|
# are excluded because Tantivy's tokenized fast fields produce different
|
|
# ordering than the ORM's collation-based ordering.
|
|
SORTABLE_FIELDS: frozenset[str] = frozenset(
|
|
{
|
|
"created",
|
|
"added",
|
|
"modified",
|
|
"archive_serial_number",
|
|
"page_count",
|
|
"num_notes",
|
|
},
|
|
)
|
|
|
|
def __init__(self, path: Path | None = None):
|
|
# path=None → in-memory index (for tests)
|
|
# path=some_dir → on-disk index (for production)
|
|
self._path = path
|
|
self._index = None
|
|
self._schema = None
|
|
|
|
def open(self) -> None:
|
|
"""
|
|
Open or rebuild the index as needed.
|
|
|
|
For disk-based indexes, checks if rebuilding is needed due to schema
|
|
version or language changes. Registers custom tokenizers after opening.
|
|
Safe to call multiple times - subsequent calls are no-ops.
|
|
"""
|
|
if self._index is not None:
|
|
return # pragma: no cover
|
|
if self._path is not None:
|
|
self._index = open_or_rebuild_index(self._path)
|
|
else:
|
|
self._index = tantivy.Index(build_schema())
|
|
register_tokenizers(self._index, settings.SEARCH_LANGUAGE)
|
|
self._schema = self._index.schema
|
|
|
|
def close(self) -> None:
|
|
"""
|
|
Close the index and release resources.
|
|
|
|
Safe to call multiple times - subsequent calls are no-ops.
|
|
"""
|
|
self._index = None
|
|
self._schema = None
|
|
|
|
def _ensure_open(self) -> None:
|
|
"""Ensure the index is open before operations."""
|
|
if self._index is None:
|
|
self.open() # pragma: no cover
|
|
|
|
def _parse_query(
|
|
self,
|
|
query: str,
|
|
search_mode: SearchMode,
|
|
) -> tantivy.Query:
|
|
"""Parse a user query string into a Tantivy Query object."""
|
|
tz = get_current_timezone()
|
|
if search_mode is SearchMode.TEXT:
|
|
return parse_simple_text_query(self._index, query)
|
|
elif search_mode is SearchMode.TITLE:
|
|
return parse_simple_title_query(self._index, query)
|
|
else:
|
|
return parse_user_query(self._index, query, tz)
|
|
|
|
def _apply_permission_filter(
|
|
self,
|
|
query: tantivy.Query,
|
|
user: AbstractBaseUser | None,
|
|
) -> tantivy.Query:
|
|
"""Wrap a query with a permission filter if the user is not a superuser."""
|
|
if user is not None:
|
|
permission_filter = build_permission_filter(self._schema, user)
|
|
return tantivy.Query.boolean_query(
|
|
[
|
|
(tantivy.Occur.Must, query),
|
|
(tantivy.Occur.Must, permission_filter),
|
|
],
|
|
)
|
|
return query
|
|
|
|
def _build_tantivy_doc(
|
|
self,
|
|
document: Document,
|
|
effective_content: str | None = None,
|
|
) -> tantivy.Document:
|
|
"""Build a tantivy Document from a Django Document instance.
|
|
|
|
``effective_content`` overrides ``document.content`` for indexing —
|
|
used when re-indexing a root document with a newer version's OCR text.
|
|
"""
|
|
content = (
|
|
effective_content if effective_content is not None else document.content
|
|
)
|
|
|
|
doc = tantivy.Document()
|
|
|
|
# Basic fields
|
|
doc.add_unsigned("id", document.pk)
|
|
doc.add_text("checksum", document.checksum)
|
|
doc.add_text("title", document.title)
|
|
doc.add_text("title_sort", document.title)
|
|
doc.add_text("simple_title", document.title)
|
|
doc.add_text("content", content)
|
|
doc.add_text("bigram_content", content)
|
|
doc.add_text("simple_content", content)
|
|
|
|
# Original filename - only add if not None/empty
|
|
if document.original_filename:
|
|
doc.add_text("original_filename", document.original_filename)
|
|
|
|
# Correspondent
|
|
if document.correspondent:
|
|
doc.add_text("correspondent", document.correspondent.name)
|
|
doc.add_text("correspondent_sort", document.correspondent.name)
|
|
doc.add_unsigned("correspondent_id", document.correspondent_id)
|
|
|
|
# Document type
|
|
if document.document_type:
|
|
doc.add_text("document_type", document.document_type.name)
|
|
doc.add_text("type_sort", document.document_type.name)
|
|
doc.add_unsigned("document_type_id", document.document_type_id)
|
|
|
|
# Storage path
|
|
if document.storage_path:
|
|
doc.add_text("storage_path", document.storage_path.name)
|
|
doc.add_unsigned("storage_path_id", document.storage_path_id)
|
|
|
|
# Tags — collect names for autocomplete in the same pass
|
|
tag_names: list[str] = []
|
|
for tag in document.tags.all():
|
|
doc.add_text("tag", tag.name)
|
|
doc.add_unsigned("tag_id", tag.pk)
|
|
tag_names.append(tag.name)
|
|
|
|
# Notes — JSON for structured queries (notes.user:alice, notes.note:text),
|
|
# companion text field for default full-text search.
|
|
num_notes = 0
|
|
for note in document.notes.all():
|
|
num_notes += 1
|
|
doc.add_json("notes", {"note": note.note, "user": note.user.username})
|
|
|
|
# Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
|
|
# companion text field for default full-text search.
|
|
for cfi in document.custom_fields.all():
|
|
search_value = cfi.value_for_search
|
|
# Skip fields where there is no value yet
|
|
if search_value is None:
|
|
continue
|
|
doc.add_json(
|
|
"custom_fields",
|
|
{
|
|
"name": cfi.field.name,
|
|
"value": search_value,
|
|
},
|
|
)
|
|
|
|
# Dates
|
|
created_date = datetime(
|
|
document.created.year,
|
|
document.created.month,
|
|
document.created.day,
|
|
tzinfo=UTC,
|
|
)
|
|
doc.add_date("created", created_date)
|
|
doc.add_date("modified", document.modified)
|
|
doc.add_date("added", document.added)
|
|
|
|
if document.archive_serial_number is not None:
|
|
doc.add_unsigned("asn", document.archive_serial_number)
|
|
|
|
if document.page_count is not None:
|
|
doc.add_unsigned("page_count", document.page_count)
|
|
|
|
doc.add_unsigned("num_notes", num_notes)
|
|
|
|
# Owner
|
|
if document.owner_id:
|
|
doc.add_unsigned("owner_id", document.owner_id)
|
|
|
|
# Viewers with permission
|
|
users_with_perms = get_users_with_perms(
|
|
document,
|
|
only_with_perms_in=["view_document"],
|
|
)
|
|
for user in users_with_perms:
|
|
doc.add_unsigned("viewer_id", user.pk)
|
|
|
|
# Autocomplete words
|
|
text_sources = [document.title, content]
|
|
if document.correspondent:
|
|
text_sources.append(document.correspondent.name)
|
|
if document.document_type:
|
|
text_sources.append(document.document_type.name)
|
|
text_sources.extend(tag_names)
|
|
|
|
for word in sorted(_extract_autocomplete_words(text_sources)):
|
|
doc.add_text("autocomplete_word", word)
|
|
|
|
return doc
|
|
|
|
def add_or_update(
|
|
self,
|
|
document: Document,
|
|
effective_content: str | None = None,
|
|
) -> None:
|
|
"""
|
|
Add or update a single document with file locking.
|
|
|
|
Convenience method for single-document updates. For bulk operations,
|
|
use batch_update() context manager for better performance.
|
|
|
|
Args:
|
|
document: Django Document instance to index
|
|
effective_content: Override document.content for indexing
|
|
"""
|
|
self._ensure_open()
|
|
with self.batch_update(lock_timeout=5.0) as batch:
|
|
batch.add_or_update(document, effective_content)
|
|
|
|
def remove(self, doc_id: int) -> None:
|
|
"""
|
|
Remove a single document from the index with file locking.
|
|
|
|
Convenience method for single-document removal. For bulk operations,
|
|
use batch_update() context manager for better performance.
|
|
|
|
Args:
|
|
doc_id: Primary key of the document to remove
|
|
"""
|
|
self._ensure_open()
|
|
with self.batch_update(lock_timeout=5.0) as batch:
|
|
batch.remove(doc_id)
|
|
|
|
def search(
|
|
self,
|
|
query: str,
|
|
user: AbstractBaseUser | None,
|
|
page: int,
|
|
page_size: int,
|
|
sort_field: str | None,
|
|
*,
|
|
sort_reverse: bool,
|
|
search_mode: SearchMode = SearchMode.QUERY,
|
|
highlight_page: int | None = None,
|
|
highlight_page_size: int | None = None,
|
|
) -> SearchResults:
|
|
"""
|
|
Execute a search query against the document index.
|
|
|
|
Processes the user query through date rewriting, normalization, and
|
|
permission filtering before executing against Tantivy. Supports both
|
|
relevance-based and field-based sorting.
|
|
|
|
QUERY search mode supports natural date keywords, field filters, etc.
|
|
TITLE search mode treats the query as plain text to search for in title only
|
|
TEXT search mode treats the query as plain text to search for in title and content
|
|
|
|
Args:
|
|
query: User's search query
|
|
user: User for permission filtering (None for superuser/no filtering)
|
|
page: Page number (1-indexed) for pagination
|
|
page_size: Number of results per page
|
|
sort_field: Field to sort by (None for relevance ranking)
|
|
sort_reverse: Whether to reverse the sort order
|
|
search_mode: "query" for advanced Tantivy syntax, "text" for
|
|
plain-text search over title and content only, "title" for
|
|
plain-text search over title only
|
|
|
|
Returns:
|
|
SearchResults with hits, total count, and processed query
|
|
"""
|
|
self._ensure_open()
|
|
user_query = self._parse_query(query, search_mode)
|
|
final_query = self._apply_permission_filter(user_query, user)
|
|
|
|
searcher = self._index.searcher()
|
|
offset = (page - 1) * page_size
|
|
|
|
# Perform search
|
|
if sort_field and sort_field in self.SORT_FIELD_MAP:
|
|
mapped_field = self.SORT_FIELD_MAP[sort_field]
|
|
results = searcher.search(
|
|
final_query,
|
|
limit=offset + page_size,
|
|
order_by_field=mapped_field,
|
|
order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
|
|
)
|
|
# Field sorting: hits are still (score, DocAddress) tuples; score unused
|
|
all_hits = [(hit[1], 0.0) for hit in results.hits]
|
|
else:
|
|
# Score-based search: hits are (score, DocAddress) tuples
|
|
results = searcher.search(final_query, limit=offset + page_size)
|
|
all_hits = [(hit[1], hit[0]) for hit in results.hits]
|
|
|
|
total = results.count
|
|
|
|
# Normalize scores for score-based searches
|
|
if not sort_field and all_hits:
|
|
max_score = max(hit[1] for hit in all_hits) or 1.0
|
|
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
|
|
|
|
# Apply threshold filter if configured (score-based search only)
|
|
threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
|
|
if threshold is not None and not sort_field:
|
|
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
|
|
|
|
# Get the page's hits
|
|
page_hits = all_hits[offset : offset + page_size]
|
|
|
|
# Build result hits with highlights
|
|
hits: list[SearchHit] = []
|
|
snippet_generator = None
|
|
notes_snippet_generator = None
|
|
|
|
# Determine which hits need highlights
|
|
if highlight_page is not None and highlight_page_size is not None:
|
|
hl_start = (highlight_page - 1) * highlight_page_size
|
|
hl_end = hl_start + highlight_page_size
|
|
else:
|
|
# Highlight all hits (backward-compatible default)
|
|
hl_start = 0
|
|
hl_end = len(page_hits)
|
|
|
|
for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
|
|
# Get the actual document from the searcher using the doc address
|
|
actual_doc = searcher.doc(doc_address)
|
|
doc_dict = actual_doc.to_dict()
|
|
doc_id = doc_dict["id"][0]
|
|
|
|
highlights: dict[str, str] = {}
|
|
|
|
# Generate highlights if score > 0 and hit is in the highlight window
|
|
hit_index = rank - offset - 1 # 0-based index within page_hits
|
|
if score > 0 and hl_start <= hit_index < hl_end:
|
|
try:
|
|
if snippet_generator is None:
|
|
snippet_generator = tantivy.SnippetGenerator.create(
|
|
searcher,
|
|
final_query,
|
|
self._schema,
|
|
"content",
|
|
)
|
|
|
|
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
|
|
if content_snippet:
|
|
highlights["content"] = str(content_snippet)
|
|
|
|
# Try notes highlights
|
|
if "notes" in doc_dict:
|
|
if notes_snippet_generator is None:
|
|
notes_snippet_generator = tantivy.SnippetGenerator.create(
|
|
searcher,
|
|
final_query,
|
|
self._schema,
|
|
"notes",
|
|
)
|
|
notes_snippet = notes_snippet_generator.snippet_from_doc(
|
|
actual_doc,
|
|
)
|
|
if notes_snippet:
|
|
highlights["notes"] = str(notes_snippet)
|
|
|
|
except Exception: # pragma: no cover
|
|
logger.debug("Failed to generate highlights for doc %s", doc_id)
|
|
|
|
hits.append(
|
|
SearchHit(
|
|
id=doc_id,
|
|
score=score,
|
|
rank=rank,
|
|
highlights=highlights,
|
|
),
|
|
)
|
|
|
|
return SearchResults(
|
|
hits=hits,
|
|
total=total,
|
|
query=query,
|
|
)
|
|
|
|
def highlight_hits(
|
|
self,
|
|
query: str,
|
|
doc_ids: list[int],
|
|
*,
|
|
search_mode: SearchMode = SearchMode.QUERY,
|
|
) -> list[SearchHit]:
|
|
"""
|
|
Generate SearchHit dicts with highlights for specific document IDs.
|
|
|
|
Unlike search(), this does not execute a ranked query — it looks up
|
|
each document by ID and generates snippets against the provided query.
|
|
Use this when you already know which documents to display (from
|
|
search_ids + ORM filtering) and just need highlight data.
|
|
|
|
Note: Each doc_id requires an individual index lookup because tantivy-py
|
|
does not expose a batch doc-address-by-ID API. This is acceptable for
|
|
page-sized batches (typically 25 docs) but should not be called with
|
|
thousands of IDs.
|
|
|
|
Args:
|
|
query: The search query (used for snippet generation)
|
|
doc_ids: Ordered list of document IDs to generate hits for
|
|
search_mode: Query parsing mode (for building the snippet query)
|
|
|
|
Returns:
|
|
List of SearchHit dicts in the same order as doc_ids
|
|
"""
|
|
if not doc_ids:
|
|
return []
|
|
|
|
self._ensure_open()
|
|
user_query = self._parse_query(query, search_mode)
|
|
|
|
searcher = self._index.searcher()
|
|
snippet_generator = None
|
|
hits: list[SearchHit] = []
|
|
|
|
for rank, doc_id in enumerate(doc_ids, start=1):
|
|
# Look up document by ID
|
|
id_query = tantivy.Query.range_query(
|
|
self._schema,
|
|
"id",
|
|
tantivy.FieldType.Unsigned,
|
|
doc_id,
|
|
doc_id,
|
|
)
|
|
results = searcher.search(id_query, limit=1)
|
|
|
|
if not results.hits:
|
|
continue
|
|
|
|
doc_address = results.hits[0][1]
|
|
actual_doc = searcher.doc(doc_address)
|
|
doc_dict = actual_doc.to_dict()
|
|
|
|
highlights: dict[str, str] = {}
|
|
try:
|
|
if snippet_generator is None:
|
|
snippet_generator = tantivy.SnippetGenerator.create(
|
|
searcher,
|
|
user_query,
|
|
self._schema,
|
|
"content",
|
|
)
|
|
|
|
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
|
|
if content_snippet:
|
|
highlights["content"] = str(content_snippet)
|
|
|
|
if "notes" in doc_dict:
|
|
notes_generator = tantivy.SnippetGenerator.create(
|
|
searcher,
|
|
user_query,
|
|
self._schema,
|
|
"notes",
|
|
)
|
|
notes_snippet = notes_generator.snippet_from_doc(actual_doc)
|
|
if notes_snippet:
|
|
highlights["notes"] = str(notes_snippet)
|
|
|
|
except Exception: # pragma: no cover
|
|
logger.debug("Failed to generate highlights for doc %s", doc_id)
|
|
|
|
hits.append(
|
|
SearchHit(
|
|
id=doc_id,
|
|
score=0.0,
|
|
rank=rank,
|
|
highlights=highlights,
|
|
),
|
|
)
|
|
|
|
return hits
|
|
|
|
def search_ids(
|
|
self,
|
|
query: str,
|
|
user: AbstractBaseUser | None,
|
|
*,
|
|
sort_field: str | None = None,
|
|
sort_reverse: bool = False,
|
|
search_mode: SearchMode = SearchMode.QUERY,
|
|
limit: int | None = None,
|
|
) -> list[int]:
|
|
"""
|
|
Return document IDs matching a query — no highlights or scores.
|
|
|
|
This is the lightweight companion to search(). Use it when you need the
|
|
full set of matching IDs (e.g. for ``selection_data``) but don't need
|
|
scores, ranks, or highlights.
|
|
|
|
Args:
|
|
query: User's search query
|
|
user: User for permission filtering (None for superuser/no filtering)
|
|
sort_field: Field to sort by (None for relevance ranking)
|
|
sort_reverse: Whether to reverse the sort order
|
|
search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
|
|
limit: Maximum number of IDs to return (None = all matching docs)
|
|
|
|
Returns:
|
|
List of document IDs in the requested order
|
|
"""
|
|
self._ensure_open()
|
|
user_query = self._parse_query(query, search_mode)
|
|
final_query = self._apply_permission_filter(user_query, user)
|
|
|
|
searcher = self._index.searcher()
|
|
effective_limit = limit if limit is not None else searcher.num_docs
|
|
|
|
if sort_field and sort_field in self.SORT_FIELD_MAP:
|
|
mapped_field = self.SORT_FIELD_MAP[sort_field]
|
|
results = searcher.search(
|
|
final_query,
|
|
limit=effective_limit,
|
|
order_by_field=mapped_field,
|
|
order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
|
|
)
|
|
all_hits = [(hit[1],) for hit in results.hits]
|
|
else:
|
|
results = searcher.search(final_query, limit=effective_limit)
|
|
all_hits = [(hit[1], hit[0]) for hit in results.hits]
|
|
|
|
# Normalize scores and apply threshold (relevance search only)
|
|
if all_hits:
|
|
max_score = max(hit[1] for hit in all_hits) or 1.0
|
|
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
|
|
|
|
threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
|
|
if threshold is not None:
|
|
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
|
|
|
|
return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
|
|
|
|
def autocomplete(
|
|
self,
|
|
term: str,
|
|
limit: int,
|
|
user: AbstractBaseUser | None = None,
|
|
) -> list[str]:
|
|
"""
|
|
Get autocomplete suggestions for search queries.
|
|
|
|
Returns words that start with the given term prefix, ranked by document
|
|
frequency (how many documents contain each word). Optionally filters
|
|
results to only words from documents visible to the specified user.
|
|
|
|
Args:
|
|
term: Prefix to match against autocomplete words
|
|
limit: Maximum number of suggestions to return
|
|
user: User for permission filtering (None for no filtering)
|
|
|
|
Returns:
|
|
List of word suggestions ordered by frequency, then alphabetically
|
|
"""
|
|
self._ensure_open()
|
|
normalized_term = ascii_fold(term.lower())
|
|
|
|
searcher = self._index.searcher()
|
|
|
|
# Apply permission filter for non-superusers so autocomplete words
|
|
# from invisible documents don't leak to other users.
|
|
if user is not None and not user.is_superuser:
|
|
base_query = build_permission_filter(self._schema, user)
|
|
else:
|
|
base_query = tantivy.Query.all_query()
|
|
|
|
results = searcher.search(base_query, limit=searcher.num_docs)
|
|
|
|
# Count how many visible documents each word appears in.
|
|
# Using Counter (not set) preserves per-word document frequency so
|
|
# we can rank suggestions by how commonly they occur — the same
|
|
# signal Whoosh used for Tf/Idf-based autocomplete ordering.
|
|
word_counts: Counter[str] = Counter()
|
|
for _score, doc_address in results.hits:
|
|
stored_doc = searcher.doc(doc_address)
|
|
doc_dict = stored_doc.to_dict()
|
|
if "autocomplete_word" in doc_dict:
|
|
word_counts.update(doc_dict["autocomplete_word"])
|
|
|
|
# Filter to prefix matches, sort by document frequency descending;
|
|
# break ties alphabetically for stable, deterministic output.
|
|
matches = sorted(
|
|
(w for w in word_counts if w.startswith(normalized_term)),
|
|
key=lambda w: (-word_counts[w], w),
|
|
)
|
|
|
|
return matches[:limit]
|
|
|
|
def more_like_this(
|
|
self,
|
|
doc_id: int,
|
|
user: AbstractBaseUser | None,
|
|
page: int,
|
|
page_size: int,
|
|
) -> SearchResults:
|
|
"""
|
|
Find documents similar to the given document using content analysis.
|
|
|
|
Uses Tantivy's "more like this" query to find documents with similar
|
|
content patterns. The original document is excluded from results.
|
|
|
|
Args:
|
|
doc_id: Primary key of the reference document
|
|
user: User for permission filtering (None for no filtering)
|
|
page: Page number (1-indexed) for pagination
|
|
page_size: Number of results per page
|
|
|
|
Returns:
|
|
SearchResults with similar documents (excluding the original)
|
|
"""
|
|
self._ensure_open()
|
|
searcher = self._index.searcher()
|
|
|
|
# First find the document address
|
|
id_query = tantivy.Query.range_query(
|
|
self._schema,
|
|
"id",
|
|
tantivy.FieldType.Unsigned,
|
|
doc_id,
|
|
doc_id,
|
|
)
|
|
results = searcher.search(id_query, limit=1)
|
|
|
|
if not results.hits:
|
|
# Document not found
|
|
return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")
|
|
|
|
# Extract doc_address from (score, doc_address) tuple
|
|
doc_address = results.hits[0][1]
|
|
|
|
# Build more like this query
|
|
mlt_query = tantivy.Query.more_like_this_query(
|
|
doc_address,
|
|
min_doc_frequency=1,
|
|
max_doc_frequency=None,
|
|
min_term_frequency=1,
|
|
max_query_terms=12,
|
|
min_word_length=None,
|
|
max_word_length=None,
|
|
boost_factor=None,
|
|
)
|
|
|
|
final_query = self._apply_permission_filter(mlt_query, user)
|
|
|
|
# Search
|
|
offset = (page - 1) * page_size
|
|
results = searcher.search(final_query, limit=offset + page_size)
|
|
|
|
total = results.count
|
|
# Convert from (score, doc_address) to (doc_address, score)
|
|
all_hits = [(hit[1], hit[0]) for hit in results.hits]
|
|
|
|
# Normalize scores
|
|
if all_hits:
|
|
max_score = max(hit[1] for hit in all_hits) or 1.0
|
|
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
|
|
|
|
# Get page hits
|
|
page_hits = all_hits[offset : offset + page_size]
|
|
|
|
# Build results
|
|
hits: list[SearchHit] = []
|
|
for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
|
|
actual_doc = searcher.doc(doc_address)
|
|
doc_dict = actual_doc.to_dict()
|
|
result_doc_id = doc_dict["id"][0]
|
|
|
|
# Skip the original document
|
|
if result_doc_id == doc_id:
|
|
continue
|
|
|
|
hits.append(
|
|
SearchHit(
|
|
id=result_doc_id,
|
|
score=score,
|
|
rank=rank,
|
|
highlights={}, # MLT doesn't generate highlights
|
|
),
|
|
)
|
|
|
|
return SearchResults(
|
|
hits=hits,
|
|
total=max(0, total - 1), # Subtract 1 for the original document
|
|
query=f"more_like:{doc_id}",
|
|
)
|
|
|
|
def more_like_this_ids(
|
|
self,
|
|
doc_id: int,
|
|
user: AbstractBaseUser | None,
|
|
*,
|
|
limit: int | None = None,
|
|
) -> list[int]:
|
|
"""
|
|
Return IDs of documents similar to the given document — no highlights.
|
|
|
|
Lightweight companion to more_like_this(). The original document is
|
|
excluded from results.
|
|
|
|
Args:
|
|
doc_id: Primary key of the reference document
|
|
user: User for permission filtering (None for no filtering)
|
|
limit: Maximum number of IDs to return (None = all matching docs)
|
|
|
|
Returns:
|
|
List of similar document IDs (excluding the original)
|
|
"""
|
|
self._ensure_open()
|
|
searcher = self._index.searcher()
|
|
|
|
id_query = tantivy.Query.range_query(
|
|
self._schema,
|
|
"id",
|
|
tantivy.FieldType.Unsigned,
|
|
doc_id,
|
|
doc_id,
|
|
)
|
|
results = searcher.search(id_query, limit=1)
|
|
|
|
if not results.hits:
|
|
return []
|
|
|
|
doc_address = results.hits[0][1]
|
|
mlt_query = tantivy.Query.more_like_this_query(
|
|
doc_address,
|
|
min_doc_frequency=1,
|
|
max_doc_frequency=None,
|
|
min_term_frequency=1,
|
|
max_query_terms=12,
|
|
min_word_length=None,
|
|
max_word_length=None,
|
|
boost_factor=None,
|
|
)
|
|
|
|
final_query = self._apply_permission_filter(mlt_query, user)
|
|
|
|
effective_limit = limit if limit is not None else searcher.num_docs
|
|
results = searcher.search(final_query, limit=effective_limit)
|
|
|
|
ids = []
|
|
for _score, doc_address in results.hits:
|
|
result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
|
|
if result_doc_id != doc_id:
|
|
ids.append(result_doc_id)
|
|
return ids
|
|
|
|
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
|
|
"""
|
|
Get a batch context manager for bulk index operations.
|
|
|
|
Use this for efficient bulk document updates/deletions. All operations
|
|
within the batch are committed atomically at the end of the context.
|
|
|
|
Args:
|
|
lock_timeout: Seconds to wait for file lock acquisition
|
|
|
|
Returns:
|
|
WriteBatch context manager
|
|
|
|
Raises:
|
|
SearchIndexLockError: If lock cannot be acquired within timeout
|
|
"""
|
|
self._ensure_open()
|
|
return WriteBatch(self, lock_timeout)
|
|
|
|
def rebuild(
|
|
self,
|
|
documents: QuerySet[Document],
|
|
iter_wrapper: IterWrapper[Document] = identity,
|
|
) -> None:
|
|
"""
|
|
Rebuild the entire search index from scratch.
|
|
|
|
Wipes the existing index and re-indexes all provided documents.
|
|
On failure, restores the previous index state to keep the backend usable.
|
|
|
|
Args:
|
|
documents: QuerySet of Document instances to index
|
|
iter_wrapper: Optional wrapper function for progress tracking
|
|
(e.g., progress bar). Should yield each document unchanged.
|
|
"""
|
|
# Create new index (on-disk or in-memory)
|
|
if self._path is not None:
|
|
wipe_index(self._path)
|
|
new_index = tantivy.Index(build_schema(), path=str(self._path))
|
|
_write_sentinels(self._path)
|
|
else:
|
|
new_index = tantivy.Index(build_schema())
|
|
register_tokenizers(new_index, settings.SEARCH_LANGUAGE)
|
|
|
|
# Point instance at the new index so _build_tantivy_doc uses it
|
|
old_index, old_schema = self._index, self._schema
|
|
self._index = new_index
|
|
self._schema = new_index.schema
|
|
|
|
try:
|
|
writer = new_index.writer()
|
|
for document in iter_wrapper(documents):
|
|
doc = self._build_tantivy_doc(
|
|
document,
|
|
document.get_effective_content(),
|
|
)
|
|
writer.add_document(doc)
|
|
writer.commit()
|
|
new_index.reload()
|
|
except BaseException: # pragma: no cover
|
|
# Restore old index on failure so the backend remains usable
|
|
self._index = old_index
|
|
self._schema = old_schema
|
|
raise
|
|
|
|
|
|
# Module-level singleton with proper thread safety
|
|
_backend: TantivyBackend | None = None
|
|
_backend_path: Path | None = None # tracks which INDEX_DIR the singleton uses
|
|
_backend_lock = threading.RLock()
|
|
|
|
|
|
def get_backend() -> TantivyBackend:
|
|
"""
|
|
Get the global backend instance with thread safety.
|
|
|
|
Returns a singleton TantivyBackend instance, automatically reinitializing
|
|
when settings.INDEX_DIR changes. This ensures proper test isolation when
|
|
using pytest-xdist or @override_settings that change the index directory.
|
|
|
|
Returns:
|
|
Thread-safe singleton TantivyBackend instance
|
|
"""
|
|
global _backend, _backend_path
|
|
|
|
current_path: Path = settings.INDEX_DIR
|
|
|
|
# Fast path: backend is initialized and path hasn't changed (no lock needed)
|
|
if _backend is not None and _backend_path == current_path:
|
|
return _backend
|
|
|
|
# Slow path: first call, or INDEX_DIR changed between calls
|
|
with _backend_lock:
|
|
# Double-check after acquiring lock — another thread may have beaten us
|
|
if _backend is not None and _backend_path == current_path:
|
|
return _backend # pragma: no cover
|
|
|
|
if _backend is not None:
|
|
_backend.close()
|
|
|
|
_backend = TantivyBackend(path=current_path)
|
|
_backend.open()
|
|
_backend_path = current_path
|
|
|
|
return _backend
|
|
|
|
|
|
def reset_backend() -> None:
|
|
"""
|
|
Reset the global backend instance with thread safety.
|
|
|
|
Forces creation of a new backend instance on the next get_backend() call.
|
|
Used for test isolation and when switching between different index directories.
|
|
"""
|
|
global _backend, _backend_path
|
|
|
|
with _backend_lock:
|
|
if _backend is not None:
|
|
_backend.close()
|
|
_backend = None
|
|
_backend_path = None
|