From e8fe3a6a6248ded6f9a683e493ab3add424a3103 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sun, 29 Mar 2026 14:30:40 -0700 Subject: [PATCH] =?UTF-8?q?feat(search):=20tokenizer=20registration=20?= =?UTF-8?q?=E2=80=94=20paperless=5Ftext=20with=20language=20stemming,=20si?= =?UTF-8?q?mple=5Fanalyzer,=20bigram=5Fanalyzer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit implements Task 3 of the Tantivy search backend migration: - Add `src/documents/search/_tokenizer.py` with three custom tokenizers: - `paperless_text`: simple → remove_long(65) → lowercase → ascii_fold [→ stemmer] Supports 18 languages via Snowball stemmer with fallback warning for unsupported languages - `simple_analyzer`: simple → lowercase → ascii_fold (for shadow sort fields) - `bigram_analyzer`: ngram(2,2) → lowercase (for CJK/no-whitespace language support) - Add comprehensive tests in `src/documents/tests/search/test_tokenizer.py`: - ASCII folding test: verifies "café résumé" is findable as "cafe resume" - Bigram CJK test: verifies "東京都" is searchable by substring "東京" - Warning test: verifies unsupported languages log appropriate warnings - `register_tokenizers()` function must be called on every Index instance as tantivy requires re-registration at each open - Language support includes common ISO 639-1 codes and full names: Arabic, Danish, Dutch, English, Finnish, French, German, Greek, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish, Tamil, Turkish Co-Authored-By: Claude Sonnet 4.6 --- src/documents/search/_tokenizer.py | 101 +++++++++++++++++++ src/documents/tests/search/test_tokenizer.py | 77 ++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 src/documents/search/_tokenizer.py create mode 100644 src/documents/tests/search/test_tokenizer.py diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py new file mode 100644 
index 000000000..f15615d99
--- /dev/null
+++ b/src/documents/search/_tokenizer.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import logging
+
+import tantivy
+
+logger = logging.getLogger("paperless.search")
+
+# Mapping of ISO 639-1 codes (and common aliases) → Tantivy Snowball name
+_LANGUAGE_MAP: dict[str, str] = {
+    "ar": "Arabic",
+    "arabic": "Arabic",
+    "da": "Danish",
+    "danish": "Danish",
+    "nl": "Dutch",
+    "dutch": "Dutch",
+    "en": "English",
+    "english": "English",
+    "fi": "Finnish",
+    "finnish": "Finnish",
+    "fr": "French",
+    "french": "French",
+    "de": "German",
+    "german": "German",
+    "el": "Greek",
+    "greek": "Greek",
+    "hu": "Hungarian",
+    "hungarian": "Hungarian",
+    "it": "Italian",
+    "italian": "Italian",
+    "no": "Norwegian",
+    "norwegian": "Norwegian",
+    "pt": "Portuguese",
+    "portuguese": "Portuguese",
+    "ro": "Romanian",
+    "romanian": "Romanian",
+    "ru": "Russian",
+    "russian": "Russian",
+    "es": "Spanish",
+    "spanish": "Spanish",
+    "sv": "Swedish",
+    "swedish": "Swedish",
+    "ta": "Tamil",
+    "tamil": "Tamil",
+    "tr": "Turkish",
+    "turkish": "Turkish",
+}
+
+SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
+
+
+def register_tokenizers(index: tantivy.Index, language: str) -> None:
+    """
+    Register all custom tokenizers on *index*.
+
+    Must be called on every Index instance — tantivy requires re-registration
+    at each open.
+
+    Args:
+        index: The index to register ``paperless_text``, ``simple_analyzer``
+            and ``bigram_analyzer`` on.
+        language: Language code or name selecting the ``paperless_text``
+            stemmer; an empty value disables stemming.
+    """
+    index.register_tokenizer("paperless_text", _paperless_text(language))
+    index.register_tokenizer("simple_analyzer", _simple_analyzer())
+    index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
+
+
+def _resolve_stemmer_language(language: str) -> str | None:
+    """
+    Map a configured language onto a Tantivy Snowball stemmer name.
+
+    Lookup ignores case and surrounding whitespace. A regional variant such
+    as ``en-US`` or ``pt_BR`` falls back to its primary subtag.
+
+    Returns:
+        The stemmer name, or ``None`` when the language is not supported.
+    """
+    key = language.strip().lower().replace("_", "-")
+    return _LANGUAGE_MAP.get(key) or _LANGUAGE_MAP.get(key.partition("-")[0])
+
+
+def _paperless_text(language: str) -> tantivy.TextAnalyzer:
+    """
+    simple → remove_long(65) → lowercase → ascii_fold [→ stemmer]
+
+    The stemmer is appended only when *language* resolves to a supported
+    language; an unsupported, non-empty value logs a warning and leaves
+    stemming disabled.
+    """
+    builder = (
+        tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
+        .filter(tantivy.Filter.remove_long(65))
+        .filter(tantivy.Filter.lowercase())
+        .filter(tantivy.Filter.ascii_fold())
+    )
+    if language:
+        tantivy_lang = _resolve_stemmer_language(language)
+        if tantivy_lang:
+            builder = builder.filter(tantivy.Filter.stemmer(tantivy_lang))
+        else:
+            logger.warning(
+                "Unsupported search language '%s' — stemming disabled. Supported: %s",
+                language,
+                ", ".join(sorted(SUPPORTED_LANGUAGES)),
+            )
+    return builder.build()
+
+
+def _simple_analyzer() -> tantivy.TextAnalyzer:
+    """simple → lowercase → ascii_fold. Used for shadow sort fields."""
+    return (
+        tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
+        .filter(tantivy.Filter.lowercase())
+        .filter(tantivy.Filter.ascii_fold())
+        .build()
+    )
+
+
+def _bigram_analyzer() -> tantivy.TextAnalyzer:
+    """ngram(2,2) → lowercase. CJK / no-whitespace language support."""
+    return (
+        tantivy.TextAnalyzerBuilder(
+            tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False),
+        )
+        .filter(tantivy.Filter.lowercase())
+        .build()
+    )
diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py
new file mode 100644
index 000000000..6728784ae
--- /dev/null
+++ b/src/documents/tests/search/test_tokenizer.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import pytest
+import tantivy
+
+from documents.search._tokenizer import _bigram_analyzer
+from documents.search._tokenizer import _paperless_text
+from documents.search._tokenizer import _resolve_stemmer_language
+from documents.search._tokenizer import register_tokenizers
+
+if TYPE_CHECKING:
+    from _pytest.logging import LogCaptureFixture
+
+pytestmark = pytest.mark.search
+
+
+class TestTokenizers:
+    @pytest.fixture
+    def content_index(self) -> tantivy.Index:
+        """Index with just a content field for ASCII folding tests."""
+        sb = tantivy.SchemaBuilder()
+        sb.add_text_field("content", stored=True, tokenizer_name="paperless_text")
+        schema = sb.build()
+        idx = tantivy.Index(schema, path=None)
+        idx.register_tokenizer("paperless_text", _paperless_text(""))
+        return idx
+
+    @pytest.fixture
+    def bigram_index(self) -> tantivy.Index:
+        """Index with bigram field for CJK tests."""
+        sb = tantivy.SchemaBuilder()
+        sb.add_text_field(
+            "bigram_content",
+            stored=False,
+            tokenizer_name="bigram_analyzer",
+        )
+        schema = sb.build()
+        idx = tantivy.Index(schema, path=None)
+        idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
+        return idx
+
+    def test_ascii_fold_finds_accented_content(
+        self,
+        content_index: tantivy.Index,
+    ) -> None:
+        """paperless_text normalises diacritics so café is findable as cafe."""
+        writer = content_index.writer()
+        doc = tantivy.Document()
+        doc.add_text("content", "café résumé")
+        writer.add_document(doc)
+        writer.commit()
+        content_index.reload()
+        searcher = content_index.searcher()
+        # Query each term on its own: a multi-term query is a disjunction by
+        # default, so it would still match if only one term had been folded.
+        for term in ("cafe", "resume"):
+            q = content_index.parse_query(term, ["content"])
+            assert searcher.search(q, limit=5).count == 1
+
+    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
+        """bigram_analyzer makes CJK substrings searchable without whitespace."""
+        writer = bigram_index.writer()
+        doc = tantivy.Document()
+        doc.add_text("bigram_content", "東京都")
+        writer.add_document(doc)
+        writer.commit()
+        bigram_index.reload()
+        q = bigram_index.parse_query("東京", ["bigram_content"])
+        assert bigram_index.searcher().search(q, limit=5).count == 1
+
+    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
+        """register_tokenizers warns when the language has no stemmer."""
+        sb = tantivy.SchemaBuilder()
+        sb.add_text_field("content", stored=True, tokenizer_name="paperless_text")
+        schema = sb.build()
+        idx = tantivy.Index(schema, path=None)
+
+        with caplog.at_level(logging.WARNING, logger="paperless.search"):
+            register_tokenizers(idx, "klingon")
+        assert "klingon" in caplog.text
+
+    @pytest.mark.parametrize(
+        ("language", "expected"),
+        [
+            ("en", "English"),
+            (" German ", "German"),
+            ("en-US", "English"),
+            ("pt_BR", "Portuguese"),
+            ("klingon", None),
+        ],
+    )
+    def test_resolve_stemmer_language(
+        self,
+        language: str,
+        expected: str | None,
+    ) -> None:
+        """Lookup ignores case/whitespace and falls back to the base language."""
+        assert _resolve_stemmer_language(language) == expected