From 957049c512ca3ca2a39a251e2569456cb0194b44 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:01:39 -0700 Subject: [PATCH] fix(search): register fast-field tokenizer for simple_analyzer; fix perm_index fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tantivy requires register_fast_field_tokenizer for any tokenizer used by fast=True text fields — it writes default fast column values on every commit even when a document omits those fields, raising ValueError otherwise. perm_index fixture simplified to use in-memory index (path=None). Co-Authored-By: Claude Sonnet 4.6 --- src/documents/search/_tokenizer.py | 9 ++++++++- src/documents/tests/search/test_query.py | 7 +++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index 1bcd91113..5f656f585 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -52,11 +52,18 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP) def register_tokenizers(index: tantivy.Index, language: str) -> None: """ Register all custom tokenizers on *index*. Must be called on every Index - instance - tantivy requires re-registration at each open. + instance — tantivy requires re-registration at each open. + + simple_analyzer is also registered as a fast-field tokenizer because the + sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True. + Tantivy writes default values for fast columns on every commit, even for + documents that omit those fields, so the fast-field tokenizer must exist. """ index.register_tokenizer("paperless_text", _paperless_text(language)) index.register_tokenizer("simple_analyzer", _simple_analyzer()) index.register_tokenizer("bigram_analyzer", _bigram_analyzer()) + # Fast-field tokenizer required for fast=True text fields in the schema + index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer()) def _paperless_text(language: str) -> tantivy.TextAnalyzer: diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py index 775aa991b..3c6aa236f 100644 --- a/src/documents/tests/search/test_query.py +++ b/src/documents/tests/search/test_query.py @@ -214,11 +214,10 @@ class TestPermissionFilter: """build_permission_filter tests use an in-memory index — no DB access needed.""" @pytest.fixture - def perm_index(self, tmp_path) -> tantivy.Index: - # Use a temporary directory instead of in-memory index to avoid tokenizer issues + def perm_index(self) -> tantivy.Index: schema = build_schema() - idx = tantivy.Index(schema, path=str(tmp_path)) - register_tokenizers(idx, "en") + idx = tantivy.Index(schema, path=None) + register_tokenizers(idx, "") return idx def _add_doc(