diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py
index 1bcd91113..5f656f585 100644
--- a/src/documents/search/_tokenizer.py
+++ b/src/documents/search/_tokenizer.py
@@ -52,11 +52,18 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
 def register_tokenizers(index: tantivy.Index, language: str) -> None:
     """
     Register all custom tokenizers on *index*. Must be called on every Index
-    instance - tantivy requires re-registration at each open.
+    instance — tantivy requires re-registration at each open.
+
+    simple_analyzer is also registered as a fast-field tokenizer because the
+    sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
+    Tantivy writes default values for fast columns on every commit, even for
+    documents that omit those fields, so the fast-field tokenizer must exist.
     """
     index.register_tokenizer("paperless_text", _paperless_text(language))
     index.register_tokenizer("simple_analyzer", _simple_analyzer())
     index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
+    # Fast-field tokenizer required for fast=True text fields in the schema
+    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
 
 
 def _paperless_text(language: str) -> tantivy.TextAnalyzer:
diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py
index 775aa991b..3c6aa236f 100644
--- a/src/documents/tests/search/test_query.py
+++ b/src/documents/tests/search/test_query.py
@@ -214,11 +214,10 @@ class TestPermissionFilter:
     """build_permission_filter tests use an in-memory index — no DB access needed."""
 
     @pytest.fixture
-    def perm_index(self, tmp_path) -> tantivy.Index:
-        # Use a temporary directory instead of in-memory index to avoid tokenizer issues
+    def perm_index(self) -> tantivy.Index:
         schema = build_schema()
-        idx = tantivy.Index(schema, path=str(tmp_path))
-        register_tokenizers(idx, "en")
+        idx = tantivy.Index(schema, path=None)
+        register_tokenizers(idx, "")
         return idx
 
     def _add_doc(