refactor(search): replace context manager smell with explicit open/close lifecycle

TantivyBackend now uses open()/close()/_ensure_open() instead of __enter__/__exit__.
get_backend() tracks _backend_path and auto-reinitializes when settings.INDEX_DIR
changes, fixing the xdist/override_settings isolation bug where parallel workers
would share a stale singleton pointing at a deleted index directory.

Test fixtures use in-memory indices (path=None) for speed and isolation.
Singleton behavior covered by TestSingleton in test_backend.py.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 12:49:46 -07:00
parent 50f6b2d4c3
commit 34d2897ab1
4 changed files with 119 additions and 41 deletions

View File

@@ -30,6 +30,7 @@ from documents.search._tokenizer import register_tokenizers
if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import Iterable
from pathlib import Path
from django.contrib.auth.base_user import AbstractBaseUser
from django.db.models import QuerySet
@@ -124,17 +125,18 @@ class WriteBatch:
self._backend = backend
self._lock_timeout = lock_timeout
self._writer = None
self._lock = None
def __enter__(self) -> Self:
lock_path = settings.INDEX_DIR / ".tantivy.lock"
self._lock = filelock.FileLock(str(lock_path))
try:
self._lock.acquire(timeout=self._lock_timeout)
except filelock.Timeout as e:
raise SearchIndexLockError(
f"Could not acquire index lock within {self._lock_timeout}s",
) from e
if self._backend._path is not None:
lock_path = self._backend._path / ".tantivy.lock"
self._lock = filelock.FileLock(str(lock_path))
try:
self._lock.acquire(timeout=self._lock_timeout)
except filelock.Timeout as e:
raise SearchIndexLockError(
f"Could not acquire index lock within {self._lock_timeout}s",
) from e
self._writer = self._backend._index.writer()
return self
@@ -154,7 +156,7 @@ class WriteBatch:
del self._writer
self._writer = None
finally:
if hasattr(self, "_lock") and self._lock:
if self._lock is not None:
self._lock.release()
def add_or_update(
@@ -187,21 +189,35 @@ class WriteBatch:
class TantivyBackend:
"""Tantivy search backend with context manager interface."""
"""Tantivy search backend with explicit lifecycle management."""
def __init__(self):
def __init__(self, path: Path | None = None):
# path=None → in-memory index (for tests)
# path=some_dir → on-disk index (for production)
self._path = path
self._index = None
self._schema = None
def __enter__(self) -> Self:
self._index = open_or_rebuild_index()
def open(self) -> None:
"""Open or rebuild the index. Idempotent."""
if self._index is not None:
return
if self._path is not None:
self._index = open_or_rebuild_index(self._path)
else:
self._index = tantivy.Index(build_schema())
register_tokenizers(self._index, settings.SEARCH_LANGUAGE)
self._schema = self._index.schema
return self
def __exit__(self, exc_type, exc_val, exc_tb):
# Index doesn't need explicit close
pass
def close(self) -> None:
"""Close the index. Idempotent."""
self._index = None
self._schema = None
def _ensure_open(self) -> None:
"""Ensure the index is open before operations."""
if self._index is None:
self.open()
def _build_tantivy_doc(
self,
@@ -331,11 +347,13 @@ class TantivyBackend:
effective_content: str | None = None,
) -> None:
"""Add or update a single document with file locking."""
self._ensure_open()
with self.batch_update(lock_timeout=5.0) as batch:
batch.add_or_update(document, effective_content)
def remove(self, doc_id: int) -> None:
"""Remove a single document with file locking."""
self._ensure_open()
with self.batch_update(lock_timeout=5.0) as batch:
batch.remove(doc_id)
@@ -350,6 +368,7 @@ class TantivyBackend:
sort_reverse: bool,
) -> SearchResults:
"""Search the index."""
self._ensure_open()
tz = get_current_timezone()
user_query = parse_user_query(self._index, query, tz)
@@ -491,6 +510,7 @@ class TantivyBackend:
user: AbstractBaseUser | None = None,
) -> list[str]:
"""Get autocomplete suggestions, optionally filtered by user visibility."""
self._ensure_open()
normalized_term = _ascii_fold(term.lower())
searcher = self._index.searcher()
@@ -543,6 +563,7 @@ class TantivyBackend:
page_size: int,
) -> SearchResults:
"""Find documents similar to the given document."""
self._ensure_open()
searcher = self._index.searcher()
# First find the document address
@@ -630,18 +651,20 @@ class TantivyBackend:
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
"""Get a batch context manager for bulk operations."""
self._ensure_open()
return WriteBatch(self, lock_timeout)
def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
"""Rebuild the entire search index."""
from documents.search._tokenizer import register_tokenizers
index_dir = settings.INDEX_DIR
# Create new index
wipe_index(index_dir)
new_index = tantivy.Index(build_schema(), path=str(index_dir))
_write_sentinels(index_dir)
# Create new index (on-disk or in-memory)
if self._path is not None:
wipe_index(self._path)
new_index = tantivy.Index(build_schema(), path=str(self._path))
_write_sentinels(self._path)
else:
new_index = tantivy.Index(build_schema())
register_tokenizers(new_index, settings.SEARCH_LANGUAGE)
# Index all documents using the new index
@@ -672,30 +695,47 @@ class TantivyBackend:
# Module-level singleton with proper thread safety
_backend: TantivyBackend | None = None
_backend_path: Path | None = None # tracks which INDEX_DIR the singleton uses
_backend_lock = threading.RLock()
def get_backend() -> TantivyBackend:
"""Get the global backend instance with thread safety."""
global _backend
"""Get the global backend instance with thread safety.
# Fast path for already initialized backend
if _backend is not None:
Automatically reinitializes when settings.INDEX_DIR changes — this fixes
the xdist/override_settings isolation issue where each test may set a
different INDEX_DIR but would otherwise share a stale singleton.
"""
global _backend, _backend_path
current_path: Path = settings.INDEX_DIR
# Fast path: backend is initialized and path hasn't changed (no lock needed)
if _backend is not None and _backend_path == current_path:
return _backend
# Slow path with locking
# Slow path: first call, or INDEX_DIR changed between calls
with _backend_lock:
if _backend is None:
_backend = TantivyBackend()
_backend.__enter__()
# Double-check after acquiring lock — another thread may have beaten us
if _backend is not None and _backend_path == current_path:
return _backend
if _backend is not None:
_backend.close()
_backend = TantivyBackend(path=current_path)
_backend.open()
_backend_path = current_path
return _backend
def reset_backend() -> None:
"""Reset the global backend instance with thread safety."""
global _backend
global _backend, _backend_path
with _backend_lock:
if _backend is not None:
_backend.__exit__(None, None, None)
_backend.close()
_backend = None
_backend_path = None

View File

@@ -116,12 +116,14 @@ def _write_sentinels(index_dir: Path) -> None:
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
def open_or_rebuild_index() -> tantivy.Index:
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
"""
Open the Tantivy index at settings.INDEX_DIR, creating or rebuilding as needed.
Open the Tantivy index at index_dir (defaults to settings.INDEX_DIR),
creating or rebuilding as needed.
Caller must register custom tokenizers after receiving the Index.
"""
index_dir: Path = settings.INDEX_DIR
if index_dir is None:
index_dir = settings.INDEX_DIR
if _needs_rebuild(index_dir):
wipe_index(index_dir)
idx = tantivy.Index(build_schema(), path=str(index_dir))

View File

@@ -23,8 +23,11 @@ def index_dir(tmp_path: Path, settings: SettingsWrapper) -> Path:
@pytest.fixture
def backend(index_dir: Path) -> Generator[TantivyBackend, None, None]:
b = TantivyBackend()
with b:
def backend() -> Generator[TantivyBackend, None, None]:
b = TantivyBackend() # path=None → in-memory index
b.open()
try:
yield b
reset_backend()
finally:
b.close()
reset_backend()

View File

@@ -6,6 +6,8 @@ from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import Note
from documents.search._backend import TantivyBackend
from documents.search._backend import get_backend
from documents.search._backend import reset_backend
pytestmark = [pytest.mark.search, pytest.mark.django_db]
@@ -161,6 +163,37 @@ class TestMoreLikeThis:
assert 50 not in returned_ids # Original document excluded
class TestSingleton:
    """Test get_backend() and reset_backend() singleton lifecycle."""

    @pytest.fixture(autouse=True)
    def _clean(self):
        # Reset before AND after every test so no cached singleton leaks
        # between tests (or in from a previously-run test module).
        reset_backend()
        yield
        reset_backend()

    def test_returns_same_instance_on_repeated_calls(self, index_dir):
        # With an unchanged INDEX_DIR, repeated calls must return the
        # identical cached object (the fast path).
        assert get_backend() is get_backend()

    def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
        # Changing settings.INDEX_DIR between calls must produce a new
        # backend bound to the new directory — the xdist/override_settings
        # isolation fix this commit introduces.
        settings.INDEX_DIR = tmp_path / "a"
        (tmp_path / "a").mkdir()
        b1 = get_backend()
        settings.INDEX_DIR = tmp_path / "b"
        (tmp_path / "b").mkdir()
        b2 = get_backend()
        assert b1 is not b2
        # NOTE(review): reaches into the private _path attribute; assumes
        # TantivyBackend keeps exposing it — confirm if that field is renamed.
        assert b2._path == tmp_path / "b"

    def test_reset_forces_new_instance(self, index_dir):
        # reset_backend() clears the cache, so the next get_backend()
        # must construct a fresh instance.
        b1 = get_backend()
        reset_backend()
        b2 = get_backend()
        assert b1 is not b2
class TestFieldHandling:
"""Test handling of various document fields."""