mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-06 21:59:46 +00:00
fix(search): register fast-field tokenizer for simple_analyzer; fix perm_index fixture
Tantivy requires register_fast_field_tokenizer to be called for any tokenizer used by fast=True text fields: it writes default fast-column values on every commit, even when a document omits those fields, and raises ValueError if that tokenizer has not been registered. The perm_index fixture is simplified to use an in-memory index (path=None). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -52,11 +52,18 @@ SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||
def register_tokenizers(index: tantivy.Index, language: str) -> None:
|
||||
"""
|
||||
Register all custom tokenizers on *index*. Must be called on every Index
|
||||
instance - tantivy requires re-registration at each open.
|
||||
instance — tantivy requires re-registration at each open.
|
||||
|
||||
simple_analyzer is also registered as a fast-field tokenizer because the
|
||||
sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
|
||||
Tantivy writes default values for fast columns on every commit, even for
|
||||
documents that omit those fields, so the fast-field tokenizer must exist.
|
||||
"""
|
||||
index.register_tokenizer("paperless_text", _paperless_text(language))
|
||||
index.register_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
|
||||
# Fast-field tokenizer required for fast=True text fields in the schema
|
||||
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
|
||||
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
|
||||
|
||||
@@ -214,11 +214,10 @@ class TestPermissionFilter:
|
||||
"""build_permission_filter tests use an in-memory index — no DB access needed."""
|
||||
|
||||
@pytest.fixture
|
||||
def perm_index(self, tmp_path) -> tantivy.Index:
|
||||
# Use a temporary directory instead of in-memory index to avoid tokenizer issues
|
||||
def perm_index(self) -> tantivy.Index:
|
||||
schema = build_schema()
|
||||
idx = tantivy.Index(schema, path=str(tmp_path))
|
||||
register_tokenizers(idx, "en")
|
||||
idx = tantivy.Index(schema, path=None)
|
||||
register_tokenizers(idx, "")
|
||||
return idx
|
||||
|
||||
def _add_doc(
|
||||
|
||||
Reference in New Issue
Block a user