diff --git a/pyproject.toml b/pyproject.toml index bfc8c7577..2fe5d9f3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,7 +143,8 @@ typing = [ "types-python-dateutil", "types-pytz", "types-redis", - "types-setuptools", + "types-regex", + "types-setuptools" ] [tool.uv] diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 9dcb45c05..8c745dfd7 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING from typing import Self from typing import TypedDict from typing import TypeVar +from typing import cast import filelock import regex @@ -35,10 +36,8 @@ from documents.utils import identity if TYPE_CHECKING: from pathlib import Path - from django.contrib.auth.base_user import AbstractUser + from django.contrib.auth.models import AbstractUser from django.db.models import QuerySet - from tantivy import Index - from tantivy import Schema from documents.models import Document @@ -170,9 +169,16 @@ class WriteBatch: def __init__(self, backend: TantivyBackend, lock_timeout: float): self._backend = backend self._lock_timeout = lock_timeout - self._writer = None + self._raw_writer: tantivy.IndexWriter | None = None self._lock = None + @property + def _writer(self) -> tantivy.IndexWriter: + assert self._raw_writer is not None, ( + "WriteBatch not entered; use as context manager" + ) + return self._raw_writer + def __enter__(self) -> Self: if self._backend._path is not None: lock_path = self._backend._path / ".tantivy.lock" @@ -184,7 +190,7 @@ class WriteBatch: f"Could not acquire index lock within {self._lock_timeout}s", ) from e - self._writer = self._backend._index.writer() + self._raw_writer = self._backend._index.writer() return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -194,9 +200,9 @@ class WriteBatch: self._backend._index.reload() # Explicitly delete writer to release tantivy's internal lock. # On exception the uncommitted writer is simply discarded. - if self._writer is not None: - del self._writer - self._writer = None + if self._raw_writer is not None: + del self._raw_writer + self._raw_writer = None finally: if self._lock is not None: self._lock.release() @@ -274,8 +280,18 @@ class TantivyBackend: # path=None → in-memory index (for tests) # path=some_dir → on-disk index (for production) self._path = path - self._index = None - self._schema = None + self._raw_index: tantivy.Index | None = None + self._raw_schema: tantivy.Schema | None = None + + @property + def _index(self) -> tantivy.Index: + assert self._raw_index is not None, "Index not open; call open() first" + return self._raw_index + + @property + def _schema(self) -> tantivy.Schema: + assert self._raw_schema is not None, "Schema not open; call open() first" + return self._raw_schema def open(self) -> None: """ @@ -285,14 +301,14 @@ class TantivyBackend: version or language changes. Registers custom tokenizers after opening. Safe to call multiple times - subsequent calls are no-ops. """ - if self._index is not None: + if self._raw_index is not None: return # pragma: no cover if self._path is not None: - self._index = open_or_rebuild_index(self._path) + self._raw_index = open_or_rebuild_index(self._path) else: - self._index = tantivy.Index(build_schema()) - register_tokenizers(self._index, settings.SEARCH_LANGUAGE) - self._schema = self._index.schema + self._raw_index = tantivy.Index(build_schema()) + register_tokenizers(self._raw_index, settings.SEARCH_LANGUAGE) + self._raw_schema = self._raw_index.schema def close(self) -> None: """ @@ -300,12 +316,12 @@ class TantivyBackend: Safe to call multiple times - subsequent calls are no-ops. """ - self._index = None - self._schema = None + self._raw_index = None + self._raw_schema = None def _ensure_open(self) -> None: """Ensure the index is open before operations.""" - if self._index is None: + if self._raw_index is None: self.open() # pragma: no cover def _parse_query( @@ -569,7 +585,7 @@ class TantivyBackend: batch_results = searcher.search(batch_query, limit=len(doc_ids)) result_addrs = [addr for _score, addr in batch_results.hits] - result_ids = searcher.fast_field_values("id", result_addrs) + result_ids = cast("list[int]", searcher.fast_field_values("id", result_addrs)) addr_by_id: dict[int, tuple[float, tantivy.DocAddress]] = { doc_id: (score, addr) for (score, addr), doc_id in zip(batch_results.hits, result_ids) @@ -688,7 +704,10 @@ class TantivyBackend: if threshold is not None: all_hits = [hit for hit in all_hits if hit[1] >= threshold] - return searcher.fast_field_values("id", [doc_addr for doc_addr, *_ in all_hits]) + return cast( + "list[int]", + searcher.fast_field_values("id", [doc_addr for doc_addr, *_ in all_hits]), + ) def autocomplete( self, @@ -720,11 +739,6 @@ class TantivyBackend: if not normalized_term: return [] - if TYPE_CHECKING: - assert self._index is not None - assert isinstance(self._index, Index) - assert isinstance(self._schema, Schema) - searcher = self._index.searcher() permission_query = None @@ -791,7 +805,7 @@ class TantivyBackend: results = searcher.search(final_query, limit=effective_limit + 1) addrs = [addr for _score, addr in results.hits] - all_ids = searcher.fast_field_values("id", addrs) + all_ids = cast("list[int]", searcher.fast_field_values("id", addrs)) ids = [rid for rid in all_ids if rid != doc_id] return ids[:limit] if limit is not None else ids @@ -840,9 +854,9 @@ class TantivyBackend: register_tokenizers(new_index, settings.SEARCH_LANGUAGE) # Point instance at the new index so _build_tantivy_doc uses it - old_index, old_schema = self._index, self._schema - self._index = new_index - self._schema = new_index.schema + old_index, old_schema = self._raw_index, self._raw_schema + self._raw_index = new_index + self._raw_schema = new_index.schema try: writer = new_index.writer() @@ -856,8 +870,8 @@ class TantivyBackend: new_index.reload() except BaseException: # pragma: no cover # Restore old index on failure so the backend remains usable - self._index = old_index - self._schema = old_schema + self._raw_index = old_index + self._raw_schema = old_schema raise diff --git a/uv.lock b/uv.lock index 31348aa25..14a1a7d67 100644 --- a/uv.lock +++ b/uv.lock @@ -2987,6 +2987,7 @@ typing = [ { name = "types-python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "types-pytz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "types-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "types-regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "types-setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, ] @@ -3125,6 +3126,7 @@ typing = [ { name = "types-python-dateutil" }, { name = "types-pytz" }, { name = "types-redis" }, + { name = "types-regex" }, { name = "types-setuptools" }, ] @@ -5185,6 +5187,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/55/82/7d25dce10aad92d2226b269bce2f85cfd843b4477cd50245d7d40ecf8f89/types_redis-4.6.0.20241004-py3-none-any.whl", hash = "sha256:ef5da68cb827e5f606c8f9c0b49eeee4c2669d6d97122f301d3a55dc6a63f6ed", size = 58737, upload-time = "2024-10-04T02:43:57.968Z" }, ] +[[package]] +name = "types-regex" +version = "2026.4.4.20260408" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/92/42/d7c691fc5a8a8ecfba3f23c1c4c087a089af0767610d88c29201193d8f60/types_regex-2026.4.4.20260408.tar.gz", hash = "sha256:86b2975ff11b06e7f538839821510daea2566d9cb18bb8acde47834315409cf9", size = 13182, upload-time = "2026-04-08T04:31:11.887Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/92/e109654a804d11d9b60d67c7b29d64b2beac6b2e3209ea075e268e5a1021/types_regex-2026.4.4.20260408-py3-none-any.whl", hash = "sha256:d436bcc409abf9b06747b7e038014afc6d40ef7b72329655c353a1955534068f", size = 11116, upload-time = "2026-04-08T04:31:11.01Z" }, +] + [[package]] name = "types-setuptools" version = "80.10.0.20260124"