fix(search): register fast-field tokenizer for simple_analyzer; fix perm_index fixture

Tantivy requires register_fast_field_tokenizer for any tokenizer used by fast=True text fields: it writes default fast-column values on every commit, even when a document omits those fields, and raises ValueError if that tokenizer has not been registered as a fast-field tokenizer. The perm_index fixture is simplified to use an in-memory index (path=None). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-08-01 08:32:18 +00:00 · 2026-03-29 15:01:39 -07:00
parent 33da63c229
commit 957049c512
2 changed files with 11 additions and 5 deletions
@@ -52,11 +52,18 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
 def register_tokenizers(index: tantivy.Index, language: str) -> None:
    """
    Register all custom tokenizers on *index*. Must be called on every Index
-    instance - tantivy requires re-registration at each open.
+    instance — tantivy requires re-registration at each open.
+
+    simple_analyzer is also registered as a fast-field tokenizer because the
+    sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
+    Tantivy writes default values for fast columns on every commit, even for
+    documents that omit those fields, so the fast-field tokenizer must exist.
    """
    index.register_tokenizer("paperless_text", _paperless_text(language))
    index.register_tokenizer("simple_analyzer", _simple_analyzer())
    index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
+    # Fast-field tokenizer required for fast=True text fields in the schema
+    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())


 def _paperless_text(language: str) -> tantivy.TextAnalyzer:
@@ -214,11 +214,10 @@ class TestPermissionFilter:
    """build_permission_filter tests use an in-memory index — no DB access needed."""

    @pytest.fixture
-    def perm_index(self, tmp_path) -> tantivy.Index:
-        # Use a temporary directory instead of in-memory index to avoid tokenizer issues
+    def perm_index(self) -> tantivy.Index:
        schema = build_schema()
-        idx = tantivy.Index(schema, path=str(tmp_path))
-        register_tokenizers(idx, "en")
+        idx = tantivy.Index(schema, path=None)
+        register_tokenizers(idx, "")
        return idx

    def _add_doc(