mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-21 23:39:28 +00:00
refactor(search): replace context manager smell with explicit open/close lifecycle
TantivyBackend now uses open()/close()/_ensure_open() instead of __enter__/__exit__. get_backend() tracks _backend_path and auto-reinitializes when settings.INDEX_DIR changes, fixing the xdist/override_settings isolation bug where parallel workers would share a stale singleton pointing at a deleted index directory. Test fixtures use in-memory indices (path=None) for speed and isolation. Singleton behavior covered by TestSingleton in test_backend.py. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,7 @@ from documents.search._tokenizer import register_tokenizers
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
|
||||
from django.contrib.auth.base_user import AbstractBaseUser
|
||||
from django.db.models import QuerySet
|
||||
@@ -124,17 +125,18 @@ class WriteBatch:
|
||||
self._backend = backend
|
||||
self._lock_timeout = lock_timeout
|
||||
self._writer = None
|
||||
self._lock = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
lock_path = settings.INDEX_DIR / ".tantivy.lock"
|
||||
self._lock = filelock.FileLock(str(lock_path))
|
||||
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout)
|
||||
except filelock.Timeout as e:
|
||||
raise SearchIndexLockError(
|
||||
f"Could not acquire index lock within {self._lock_timeout}s",
|
||||
) from e
|
||||
if self._backend._path is not None:
|
||||
lock_path = self._backend._path / ".tantivy.lock"
|
||||
self._lock = filelock.FileLock(str(lock_path))
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout)
|
||||
except filelock.Timeout as e:
|
||||
raise SearchIndexLockError(
|
||||
f"Could not acquire index lock within {self._lock_timeout}s",
|
||||
) from e
|
||||
|
||||
self._writer = self._backend._index.writer()
|
||||
return self
|
||||
@@ -154,7 +156,7 @@ class WriteBatch:
|
||||
del self._writer
|
||||
self._writer = None
|
||||
finally:
|
||||
if hasattr(self, "_lock") and self._lock:
|
||||
if self._lock is not None:
|
||||
self._lock.release()
|
||||
|
||||
def add_or_update(
|
||||
@@ -187,21 +189,35 @@ class WriteBatch:
|
||||
|
||||
|
||||
class TantivyBackend:
|
||||
"""Tantivy search backend with context manager interface."""
|
||||
"""Tantivy search backend with explicit lifecycle management."""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, path: Path | None = None):
|
||||
# path=None → in-memory index (for tests)
|
||||
# path=some_dir → on-disk index (for production)
|
||||
self._path = path
|
||||
self._index = None
|
||||
self._schema = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
self._index = open_or_rebuild_index()
|
||||
def open(self) -> None:
|
||||
"""Open or rebuild the index. Idempotent."""
|
||||
if self._index is not None:
|
||||
return
|
||||
if self._path is not None:
|
||||
self._index = open_or_rebuild_index(self._path)
|
||||
else:
|
||||
self._index = tantivy.Index(build_schema())
|
||||
register_tokenizers(self._index, settings.SEARCH_LANGUAGE)
|
||||
self._schema = self._index.schema
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
# Index doesn't need explicit close
|
||||
pass
|
||||
def close(self) -> None:
|
||||
"""Close the index. Idempotent."""
|
||||
self._index = None
|
||||
self._schema = None
|
||||
|
||||
def _ensure_open(self) -> None:
|
||||
"""Ensure the index is open before operations."""
|
||||
if self._index is None:
|
||||
self.open()
|
||||
|
||||
def _build_tantivy_doc(
|
||||
self,
|
||||
@@ -331,11 +347,13 @@ class TantivyBackend:
|
||||
effective_content: str | None = None,
|
||||
) -> None:
|
||||
"""Add or update a single document with file locking."""
|
||||
self._ensure_open()
|
||||
with self.batch_update(lock_timeout=5.0) as batch:
|
||||
batch.add_or_update(document, effective_content)
|
||||
|
||||
def remove(self, doc_id: int) -> None:
|
||||
"""Remove a single document with file locking."""
|
||||
self._ensure_open()
|
||||
with self.batch_update(lock_timeout=5.0) as batch:
|
||||
batch.remove(doc_id)
|
||||
|
||||
@@ -350,6 +368,7 @@ class TantivyBackend:
|
||||
sort_reverse: bool,
|
||||
) -> SearchResults:
|
||||
"""Search the index."""
|
||||
self._ensure_open()
|
||||
tz = get_current_timezone()
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
|
||||
@@ -491,6 +510,7 @@ class TantivyBackend:
|
||||
user: AbstractBaseUser | None = None,
|
||||
) -> list[str]:
|
||||
"""Get autocomplete suggestions, optionally filtered by user visibility."""
|
||||
self._ensure_open()
|
||||
normalized_term = _ascii_fold(term.lower())
|
||||
|
||||
searcher = self._index.searcher()
|
||||
@@ -543,6 +563,7 @@ class TantivyBackend:
|
||||
page_size: int,
|
||||
) -> SearchResults:
|
||||
"""Find documents similar to the given document."""
|
||||
self._ensure_open()
|
||||
searcher = self._index.searcher()
|
||||
|
||||
# First find the document address
|
||||
@@ -630,18 +651,20 @@ class TantivyBackend:
|
||||
|
||||
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
|
||||
"""Get a batch context manager for bulk operations."""
|
||||
self._ensure_open()
|
||||
return WriteBatch(self, lock_timeout)
|
||||
|
||||
def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
|
||||
"""Rebuild the entire search index."""
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
index_dir = settings.INDEX_DIR
|
||||
|
||||
# Create new index
|
||||
wipe_index(index_dir)
|
||||
new_index = tantivy.Index(build_schema(), path=str(index_dir))
|
||||
_write_sentinels(index_dir)
|
||||
# Create new index (on-disk or in-memory)
|
||||
if self._path is not None:
|
||||
wipe_index(self._path)
|
||||
new_index = tantivy.Index(build_schema(), path=str(self._path))
|
||||
_write_sentinels(self._path)
|
||||
else:
|
||||
new_index = tantivy.Index(build_schema())
|
||||
register_tokenizers(new_index, settings.SEARCH_LANGUAGE)
|
||||
|
||||
# Index all documents using the new index
|
||||
@@ -672,30 +695,47 @@ class TantivyBackend:
|
||||
|
||||
# Module-level singleton with proper thread safety
|
||||
_backend: TantivyBackend | None = None
|
||||
_backend_path: Path | None = None # tracks which INDEX_DIR the singleton uses
|
||||
_backend_lock = threading.RLock()
|
||||
|
||||
|
||||
def get_backend() -> TantivyBackend:
|
||||
"""Get the global backend instance with thread safety."""
|
||||
global _backend
|
||||
"""Get the global backend instance with thread safety.
|
||||
|
||||
# Fast path for already initialized backend
|
||||
if _backend is not None:
|
||||
Automatically reinitializes when settings.INDEX_DIR changes — this fixes
|
||||
the xdist/override_settings isolation issue where each test may set a
|
||||
different INDEX_DIR but would otherwise share a stale singleton.
|
||||
"""
|
||||
global _backend, _backend_path
|
||||
|
||||
current_path: Path = settings.INDEX_DIR
|
||||
|
||||
# Fast path: backend is initialized and path hasn't changed (no lock needed)
|
||||
if _backend is not None and _backend_path == current_path:
|
||||
return _backend
|
||||
|
||||
# Slow path with locking
|
||||
# Slow path: first call, or INDEX_DIR changed between calls
|
||||
with _backend_lock:
|
||||
if _backend is None:
|
||||
_backend = TantivyBackend()
|
||||
_backend.__enter__()
|
||||
# Double-check after acquiring lock — another thread may have beaten us
|
||||
if _backend is not None and _backend_path == current_path:
|
||||
return _backend
|
||||
|
||||
if _backend is not None:
|
||||
_backend.close()
|
||||
|
||||
_backend = TantivyBackend(path=current_path)
|
||||
_backend.open()
|
||||
_backend_path = current_path
|
||||
|
||||
return _backend
|
||||
|
||||
|
||||
def reset_backend() -> None:
|
||||
"""Reset the global backend instance with thread safety."""
|
||||
global _backend
|
||||
global _backend, _backend_path
|
||||
|
||||
with _backend_lock:
|
||||
if _backend is not None:
|
||||
_backend.__exit__(None, None, None)
|
||||
_backend.close()
|
||||
_backend = None
|
||||
_backend_path = None
|
||||
|
||||
@@ -116,12 +116,14 @@ def _write_sentinels(index_dir: Path) -> None:
|
||||
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
|
||||
|
||||
|
||||
def open_or_rebuild_index() -> tantivy.Index:
|
||||
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
|
||||
"""
|
||||
Open the Tantivy index at settings.INDEX_DIR, creating or rebuilding as needed.
|
||||
Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR),
|
||||
creating or rebuilding as needed.
|
||||
Caller must register custom tokenizers after receiving the Index.
|
||||
"""
|
||||
index_dir: Path = settings.INDEX_DIR
|
||||
if index_dir is None:
|
||||
index_dir = settings.INDEX_DIR
|
||||
if _needs_rebuild(index_dir):
|
||||
wipe_index(index_dir)
|
||||
idx = tantivy.Index(build_schema(), path=str(index_dir))
|
||||
|
||||
@@ -23,8 +23,11 @@ def index_dir(tmp_path: Path, settings: SettingsWrapper) -> Path:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def backend(index_dir: Path) -> Generator[TantivyBackend, None, None]:
|
||||
b = TantivyBackend()
|
||||
with b:
|
||||
def backend() -> Generator[TantivyBackend, None, None]:
|
||||
b = TantivyBackend() # path=None → in-memory index
|
||||
b.open()
|
||||
try:
|
||||
yield b
|
||||
reset_backend()
|
||||
finally:
|
||||
b.close()
|
||||
reset_backend()
|
||||
|
||||
@@ -6,6 +6,8 @@ from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import Note
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import get_backend
|
||||
from documents.search._backend import reset_backend
|
||||
|
||||
pytestmark = [pytest.mark.search, pytest.mark.django_db]
|
||||
|
||||
@@ -161,6 +163,37 @@ class TestMoreLikeThis:
|
||||
assert 50 not in returned_ids # Original document excluded
|
||||
|
||||
|
||||
class TestSingleton:
|
||||
"""Test get_backend() and reset_backend() singleton lifecycle."""
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _clean(self):
|
||||
reset_backend()
|
||||
yield
|
||||
reset_backend()
|
||||
|
||||
def test_returns_same_instance_on_repeated_calls(self, index_dir):
|
||||
assert get_backend() is get_backend()
|
||||
|
||||
def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
|
||||
settings.INDEX_DIR = tmp_path / "a"
|
||||
(tmp_path / "a").mkdir()
|
||||
b1 = get_backend()
|
||||
|
||||
settings.INDEX_DIR = tmp_path / "b"
|
||||
(tmp_path / "b").mkdir()
|
||||
b2 = get_backend()
|
||||
|
||||
assert b1 is not b2
|
||||
assert b2._path == tmp_path / "b"
|
||||
|
||||
def test_reset_forces_new_instance(self, index_dir):
|
||||
b1 = get_backend()
|
||||
reset_backend()
|
||||
b2 = get_backend()
|
||||
assert b1 is not b2
|
||||
|
||||
|
||||
class TestFieldHandling:
|
||||
"""Test handling of various document fields."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user