Files
paperless-ngx/src/documents/search/_tokenizer.py
T

165 lines
5.8 KiB
Python

from __future__ import annotations
import logging
from typing import Final
import tantivy
logger = logging.getLogger("paperless.search")
# Lookup table: ISO 639-1 code or lowercase English language name -> Tantivy
# Snowball stemmer name. Each (code, name) pair below contributes two keys, the
# code itself and the lowercased name, both pointing at the same stemmer name.
_LANGUAGE_MAP: dict[str, str] = {
    alias: snowball_name
    for iso_code, snowball_name in (
        ("ar", "Arabic"),
        ("da", "Danish"),
        ("nl", "Dutch"),
        ("en", "English"),
        ("fi", "Finnish"),
        ("fr", "French"),
        ("de", "German"),
        ("el", "Greek"),
        ("hu", "Hungarian"),
        ("it", "Italian"),
        ("no", "Norwegian"),
        ("pt", "Portuguese"),
        ("ro", "Romanian"),
        ("ru", "Russian"),
        ("es", "Spanish"),
        ("sv", "Swedish"),
        ("ta", "Tamil"),
        ("tr", "Turkish"),
    )
    for alias in (iso_code, snowball_name.lower())
}

# Every key accepted as a stemming language (codes and lowercase names alike).
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP.keys())
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
    """
    Attach every custom tokenizer the paperless schema needs to ``index``.

    Tantivy requires tokenizers to be registered again after each index
    open/creation, so this has to run for every Index instance. It covers the
    tokenizers for full-text search, sorting, CJK language support, and
    fast-field indexing.

    Args:
        index: Tantivy index instance that receives the registrations
        language: ISO 639-1 language code used for stemming (None to disable)

    Note:
        simple_analyzer ends up registered twice: once as a text tokenizer and
        once as a fast-field tokenizer. The sort shadow fields (title_sort,
        correspondent_sort, type_sort) use fast=True, and Tantivy requires
        fast-field tokenizers to exist even for documents that omit those
        fields.
    """
    # (registered name, zero-argument factory). Factories are invoked inside
    # the loop so each analyzer is built right before it is registered.
    text_tokenizers = (
        ("paperless_text", lambda: _paperless_text(language)),
        ("simple_analyzer", _simple_analyzer),
        ("bigram_analyzer", _bigram_analyzer),
        ("simple_search_analyzer", _simple_search_analyzer),
    )
    for tokenizer_name, make_analyzer in text_tokenizers:
        index.register_tokenizer(tokenizer_name, make_analyzer())

    # fast=True text fields in the schema need a matching fast-field tokenizer.
    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
    """Build the main full-text analyzer used for content, title, etc.

    Pipeline: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer].

    The stemmer stage is appended only when ``language`` (matched
    case-insensitively) is a key of ``_LANGUAGE_MAP``. A falsy ``language``
    skips stemming silently; an unknown one logs a warning and skips stemming.
    """
    pipeline = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    pipeline = pipeline.filter(tantivy.Filter.remove_long(65))
    pipeline = pipeline.filter(tantivy.Filter.lowercase())
    pipeline = pipeline.filter(tantivy.Filter.ascii_fold())

    # Stemming not requested: the base pipeline is the whole analyzer.
    if not language:
        return pipeline.build()

    stemmer_name = _LANGUAGE_MAP.get(language.lower())
    if not stemmer_name:
        # Unknown language: fall back to the unstemmed pipeline, but say so.
        logger.warning(
            "Unsupported search language '%s' - stemming disabled. Supported: %s",
            language,
            ", ".join(sorted(SUPPORTED_LANGUAGES)),
        )
        return pipeline.build()

    return pipeline.filter(tantivy.Filter.stemmer(stemmer_name)).build()
def _simple_analyzer() -> tantivy.TextAnalyzer:
    """Build the analyzer for the shadow sort fields (title_sort, correspondent_sort, type_sort).

    Pipeline: simple -> lowercase -> ascii_fold.
    """
    sort_builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    sort_builder = sort_builder.filter(tantivy.Filter.lowercase())
    sort_builder = sort_builder.filter(tantivy.Filter.ascii_fold())
    return sort_builder.build()
def _bigram_analyzer() -> tantivy.TextAnalyzer:
    """Build the bigram analyzer that enables substring search in CJK text.

    Pipeline: ngram(2,2) -> lowercase. Provides CJK / no-whitespace language
    support.
    """
    bigram_tokenizer = tantivy.Tokenizer.ngram(
        min_gram=2,
        max_gram=2,
        prefix_only=False,
    )
    bigram_builder = tantivy.TextAnalyzerBuilder(bigram_tokenizer)
    return bigram_builder.filter(tantivy.Filter.lowercase()).build()
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
    """Build the analyzer for the simple substring search fields.

    Pipeline: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold.
    """
    # Each run of non-whitespace characters becomes one token.
    chunk_tokenizer = tantivy.Tokenizer.regex(r"\S+")

    search_builder = tantivy.TextAnalyzerBuilder(chunk_tokenizer)
    search_builder = search_builder.filter(tantivy.Filter.remove_long(65))
    search_builder = search_builder.filter(tantivy.Filter.lowercase())
    search_builder = search_builder.filter(tantivy.Filter.ascii_fold())
    return search_builder.build()
# Shared, module-level analyzers for query-side normalization. They are built
# from the exact filters applied at index time, so query terms fold identically
# to indexed terms (single source of truth for ASCII folding, instead of a
# separate Python implementation). tantivy-py's TextAnalyzer.analyze clones
# internally per call, so these instances are safe to share.
#
# Same pipeline as the "simple_search_analyzer" tokenizer: it is produced by the
# same factory function.
_SIMPLE_SEARCH_ANALYZER: Final = _simple_search_analyzer()
# The raw tokenizer keeps the whole input as one token, so this analyzer folds
# an arbitrary string to ASCII exactly like the content tokenizers (ß->ss, ø->o,
# æ->ae, ...) without splitting it - used for autocomplete words and prefixes.
# Only the ascii_fold filter is attached; no lowercase filter is applied here.
_ASCII_FOLD_ANALYZER: Final = (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.raw())
.filter(tantivy.Filter.ascii_fold())
.build()
)
def simple_search_tokens(text: str) -> list[str]:
    """Split a query string into tokens exactly as simple_title/simple_content are indexed.

    The work is delegated to the shared module-level simple-search analyzer.
    """
    query_tokens = _SIMPLE_SEARCH_ANALYZER.analyze(text)
    return query_tokens
def ascii_fold(text: str) -> str:
    """Return ``text`` folded to ASCII with the content tokenizers' mapping.

    Non-decomposable letters (ß->ss, ø->o, æ->ae, ...) are mapped the same way
    as by Tantivy's ascii_fold filter used at index time, which keeps
    query/autocomplete terms in agreement with the folded content. A naive NFD
    strip would instead delete those letters and cause silent search misses.

    Callers are expected to lowercase first, mirroring the index pipeline's
    lowercase -> ascii_fold order. If the analyzer produces no token at all,
    the empty string is returned.
    """
    folded = _ASCII_FOLD_ANALYZER.analyze(text)
    if not folded:
        return ""
    # The raw tokenizer does not split its input, so the first token is the
    # whole folded string.
    return folded[0]