fix(search): register fast-field tokenizer for simple_analyzer; fix perm_index fixture

Tantivy requires register_fast_field_tokenizer to be called for any tokenizer
used by fast=True text fields. It writes default fast-column values on every
commit, even when a document omits those fields, and raises ValueError if the
tokenizer has not been registered.

The perm_index fixture is simplified to use an in-memory index (path=None).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-29 15:01:39 -07:00
parent 33da63c229
commit 957049c512
2 changed files with 11 additions and 5 deletions
+8 -1
View File
@@ -52,11 +52,18 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
def register_tokenizers(index: tantivy.Index, language: str) -> None:
    """
    Register all custom tokenizers on *index*.

    Must be called on every Index instance — tantivy requires
    re-registration at each open.

    simple_analyzer is also registered as a fast-field tokenizer because the
    sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
    Tantivy writes default values for fast columns on every commit, even for
    documents that omit those fields, so the fast-field tokenizer must exist.

    Args:
        index: The tantivy index the tokenizers are registered on.
        language: Language forwarded to the ``paperless_text`` analyzer factory.
    """
    index.register_tokenizer("paperless_text", _paperless_text(language))
    index.register_tokenizer("simple_analyzer", _simple_analyzer())
    index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
    # Fast-field tokenizer required for fast=True text fields in the schema;
    # it is registered under the same name as the regular tokenizer above.
    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
+3 -4
View File
@@ -214,11 +214,10 @@ class TestPermissionFilter:
"""build_permission_filter tests use an in-memory index — no DB access needed."""
@pytest.fixture
def perm_index(self) -> tantivy.Index:
    """Return an in-memory index built from the project schema, tokenizers registered."""
    schema = build_schema()
    # path=None keeps the index in memory, so no tmp_path fixture is needed.
    idx = tantivy.Index(schema, path=None)
    # NOTE(review): an empty language string is passed here; presumably this
    # selects no language-specific analysis — confirm against register_tokenizers.
    register_tokenizers(idx, "")
    return idx
def _add_doc(