From 34d2897ab19c509c46ed5b1d08d65b4e07f81dcf Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 30 Mar 2026 12:49:46 -0700 Subject: [PATCH] refactor(search): replace context manager smell with explicit open/close lifecycle TantivyBackend now uses open()/close()/_ensure_open() instead of __enter__/__exit__. get_backend() tracks _backend_path and auto-reinitializes when settings.INDEX_DIR changes, fixing the xdist/override_settings isolation bug where parallel workers would share a stale singleton pointing at a deleted index directory. Test fixtures use in-memory indices (path=None) for speed and isolation. Singleton behavior covered by TestSingleton in test_backend.py. Co-Authored-By: Claude Sonnet 4.6 --- src/documents/search/_backend.py | 108 ++++++++++++++------- src/documents/search/_schema.py | 8 +- src/documents/tests/search/conftest.py | 11 ++- src/documents/tests/search/test_backend.py | 33 +++++++ 4 files changed, 119 insertions(+), 41 deletions(-) diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 56e5342f0..b9be04a0a 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -30,6 +30,7 @@ from documents.search._tokenizer import register_tokenizers if TYPE_CHECKING: from collections.abc import Callable from collections.abc import Iterable + from pathlib import Path from django.contrib.auth.base_user import AbstractBaseUser from django.db.models import QuerySet @@ -124,17 +125,18 @@ class WriteBatch: self._backend = backend self._lock_timeout = lock_timeout self._writer = None + self._lock = None def __enter__(self) -> Self: - lock_path = settings.INDEX_DIR / ".tantivy.lock" - self._lock = filelock.FileLock(str(lock_path)) - - try: - self._lock.acquire(timeout=self._lock_timeout) - except filelock.Timeout as e: - raise SearchIndexLockError( - f"Could not acquire index lock within {self._lock_timeout}s", - ) from e + if self._backend._path is not None: + lock_path = self._backend._path / ".tantivy.lock" + self._lock = filelock.FileLock(str(lock_path)) + try: + self._lock.acquire(timeout=self._lock_timeout) + except filelock.Timeout as e: + raise SearchIndexLockError( + f"Could not acquire index lock within {self._lock_timeout}s", + ) from e self._writer = self._backend._index.writer() return self @@ -154,7 +156,7 @@ class WriteBatch: del self._writer self._writer = None finally: - if hasattr(self, "_lock") and self._lock: + if self._lock is not None: self._lock.release() def add_or_update( @@ -187,21 +189,35 @@ class WriteBatch: class TantivyBackend: - """Tantivy search backend with context manager interface.""" + """Tantivy search backend with explicit lifecycle management.""" - def __init__(self): + def __init__(self, path: Path | None = None): + # path=None → in-memory index (for tests) + # path=some_dir → on-disk index (for production) + self._path = path self._index = None self._schema = None - def __enter__(self) -> Self: - self._index = open_or_rebuild_index() + def open(self) -> None: + """Open or rebuild the index. Idempotent.""" + if self._index is not None: + return + if self._path is not None: + self._index = open_or_rebuild_index(self._path) + else: + self._index = tantivy.Index(build_schema()) register_tokenizers(self._index, settings.SEARCH_LANGUAGE) self._schema = self._index.schema - return self - def __exit__(self, exc_type, exc_val, exc_tb): - # Index doesn't need explicit close - pass + def close(self) -> None: + """Close the index. Idempotent.""" + self._index = None + self._schema = None + + def _ensure_open(self) -> None: + """Ensure the index is open before operations.""" + if self._index is None: + self.open() def _build_tantivy_doc( self, @@ -331,11 +347,13 @@ class TantivyBackend: effective_content: str | None = None, ) -> None: """Add or update a single document with file locking.""" + self._ensure_open() with self.batch_update(lock_timeout=5.0) as batch: batch.add_or_update(document, effective_content) def remove(self, doc_id: int) -> None: """Remove a single document with file locking.""" + self._ensure_open() with self.batch_update(lock_timeout=5.0) as batch: batch.remove(doc_id) @@ -350,6 +368,7 @@ class TantivyBackend: sort_reverse: bool, ) -> SearchResults: """Search the index.""" + self._ensure_open() tz = get_current_timezone() user_query = parse_user_query(self._index, query, tz) @@ -491,6 +510,7 @@ class TantivyBackend: user: AbstractBaseUser | None = None, ) -> list[str]: """Get autocomplete suggestions, optionally filtered by user visibility.""" + self._ensure_open() normalized_term = _ascii_fold(term.lower()) searcher = self._index.searcher() @@ -543,6 +563,7 @@ class TantivyBackend: page_size: int, ) -> SearchResults: """Find documents similar to the given document.""" + self._ensure_open() searcher = self._index.searcher() # First find the document address @@ -630,18 +651,20 @@ class TantivyBackend: def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch: """Get a batch context manager for bulk operations.""" + self._ensure_open() return WriteBatch(self, lock_timeout) def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None: """Rebuild the entire search index.""" from documents.search._tokenizer import register_tokenizers - index_dir = settings.INDEX_DIR - - # Create new index - wipe_index(index_dir) - new_index = tantivy.Index(build_schema(), path=str(index_dir)) - _write_sentinels(index_dir) + # Create new index (on-disk or in-memory) + if self._path is not None: + wipe_index(self._path) + new_index = tantivy.Index(build_schema(), path=str(self._path)) + _write_sentinels(self._path) + else: + new_index = tantivy.Index(build_schema()) register_tokenizers(new_index, settings.SEARCH_LANGUAGE) # Index all documents using the new index @@ -672,30 +695,47 @@ class TantivyBackend: # Module-level singleton with proper thread safety _backend: TantivyBackend | None = None +_backend_path: Path | None = None # tracks which INDEX_DIR the singleton uses _backend_lock = threading.RLock() def get_backend() -> TantivyBackend: - """Get the global backend instance with thread safety.""" - global _backend + """Get the global backend instance with thread safety. - # Fast path for already initialized backend - if _backend is not None: + Automatically reinitializes when settings.INDEX_DIR changes — this fixes + the xdist/override_settings isolation issue where each test may set a + different INDEX_DIR but would otherwise share a stale singleton. + """ + global _backend, _backend_path + + current_path: Path = settings.INDEX_DIR + + # Fast path: backend is initialized and path hasn't changed (no lock needed) + if _backend is not None and _backend_path == current_path: return _backend - # Slow path with locking + # Slow path: first call, or INDEX_DIR changed between calls with _backend_lock: - if _backend is None: - _backend = TantivyBackend() - _backend.__enter__() + # Double-check after acquiring lock — another thread may have beaten us + if _backend is not None and _backend_path == current_path: + return _backend + + if _backend is not None: + _backend.close() + + _backend = TantivyBackend(path=current_path) + _backend.open() + _backend_path = current_path + return _backend def reset_backend() -> None: """Reset the global backend instance with thread safety.""" - global _backend + global _backend, _backend_path with _backend_lock: if _backend is not None: - _backend.__exit__(None, None, None) + _backend.close() _backend = None + _backend_path = None diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index ea2c7d188..d41aeeb55 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -116,12 +116,14 @@ def _write_sentinels(index_dir: Path) -> None: (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE) -def open_or_rebuild_index() -> tantivy.Index: +def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index: """ - Open the Tantivy index at settings.INDEX_DIR, creating or rebuilding as needed. + Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR), + creating or rebuilding as needed. Caller must register custom tokenizers after receiving the Index. """ - index_dir: Path = settings.INDEX_DIR + if index_dir is None: + index_dir = settings.INDEX_DIR if _needs_rebuild(index_dir): wipe_index(index_dir) idx = tantivy.Index(build_schema(), path=str(index_dir)) diff --git a/src/documents/tests/search/conftest.py b/src/documents/tests/search/conftest.py index 6946649e9..ccc26d695 100644 --- a/src/documents/tests/search/conftest.py +++ b/src/documents/tests/search/conftest.py @@ -23,8 +23,11 @@ def index_dir(tmp_path: Path, settings: SettingsWrapper) -> Path: @pytest.fixture -def backend(index_dir: Path) -> Generator[TantivyBackend, None, None]: - b = TantivyBackend() - with b: +def backend() -> Generator[TantivyBackend, None, None]: + b = TantivyBackend() # path=None → in-memory index + b.open() + try: yield b - reset_backend() + finally: + b.close() + reset_backend() diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index 20608d035..bc4d91357 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -6,6 +6,8 @@ from documents.models import CustomFieldInstance from documents.models import Document from documents.models import Note from documents.search._backend import TantivyBackend +from documents.search._backend import get_backend +from documents.search._backend import reset_backend pytestmark = [pytest.mark.search, pytest.mark.django_db] @@ -161,6 +163,37 @@ class TestMoreLikeThis: assert 50 not in returned_ids # Original document excluded +class TestSingleton: + """Test get_backend() and reset_backend() singleton lifecycle.""" + + @pytest.fixture(autouse=True) + def _clean(self): + reset_backend() + yield + reset_backend() + + def test_returns_same_instance_on_repeated_calls(self, index_dir): + assert get_backend() is get_backend() + + def test_reinitializes_when_index_dir_changes(self, tmp_path, settings): + settings.INDEX_DIR = tmp_path / "a" + (tmp_path / "a").mkdir() + b1 = get_backend() + + settings.INDEX_DIR = tmp_path / "b" + (tmp_path / "b").mkdir() + b2 = get_backend() + + assert b1 is not b2 + assert b2._path == tmp_path / "b" + + def test_reset_forces_new_instance(self, index_dir): + b1 = get_backend() + reset_backend() + b2 = get_backend() + assert b1 is not b2 + + class TestFieldHandling: """Test handling of various document fields."""