From e8fe3a6a6248ded6f9a683e493ab3add424a3103 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Sun, 29 Mar 2026 14:30:40 -0700
Subject: [PATCH] =?UTF-8?q?feat(search):=20tokenizer=20registration=20?=
 =?UTF-8?q?=E2=80=94=20paperless=5Ftext=20with=20language=20stemming,=20si?=
 =?UTF-8?q?mple=5Fanalyzer,=20bigram=5Fanalyzer?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit implements Task 3 of the Tantivy search backend migration:

- Add `src/documents/search/_tokenizer.py` with three custom tokenizers:
  - `paperless_text`: simple → remove_long(65) → lowercase → ascii_fold [→ stemmer]
    Stems 18 languages via Snowball; an unsupported language logs a warning and disables stemming
  - `simple_analyzer`: simple → lowercase → ascii_fold (for shadow sort fields)
  - `bigram_analyzer`: ngram(2,2) → lowercase (for CJK/no-whitespace language support)

- Add comprehensive tests in `src/documents/tests/search/test_tokenizer.py`:
  - ASCII folding test: verifies "café résumé" is findable as "cafe resume"
  - Bigram CJK test: verifies "東京都" is searchable by substring "東京"
  - Warning test: verifies unsupported languages log appropriate warnings

- `register_tokenizers()` must be called on every `Index` instance, because
  tantivy requires tokenizers to be re-registered at each open

- Languages may be given as ISO 639-1 codes or full English names (case-insensitive):
  Arabic, Danish, Dutch, English, Finnish, French, German, Greek,
  Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
  Spanish, Swedish, Tamil, Turkish

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/documents/search/_tokenizer.py           | 101 +++++++++++++++++++
 src/documents/tests/search/test_tokenizer.py |  77 ++++++++++++++
 2 files changed, 178 insertions(+)
 create mode 100644 src/documents/search/_tokenizer.py
 create mode 100644 src/documents/tests/search/test_tokenizer.py

diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py
new file mode 100644
index 000000000..f15615d99
--- /dev/null
+++ b/src/documents/search/_tokenizer.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import logging
+
+import tantivy
+
+logger = logging.getLogger("paperless.search")
+
+# Lowercase ISO 639-1 code or English language name → Tantivy Snowball stemmer name
+_LANGUAGE_MAP: dict[str, str] = {
+    "ar": "Arabic",
+    "arabic": "Arabic",
+    "da": "Danish",
+    "danish": "Danish",
+    "nl": "Dutch",
+    "dutch": "Dutch",
+    "en": "English",
+    "english": "English",
+    "fi": "Finnish",
+    "finnish": "Finnish",
+    "fr": "French",
+    "french": "French",
+    "de": "German",
+    "german": "German",
+    "el": "Greek",
+    "greek": "Greek",
+    "hu": "Hungarian",
+    "hungarian": "Hungarian",
+    "it": "Italian",
+    "italian": "Italian",
+    "no": "Norwegian",
+    "norwegian": "Norwegian",
+    "pt": "Portuguese",
+    "portuguese": "Portuguese",
+    "ro": "Romanian",
+    "romanian": "Romanian",
+    "ru": "Russian",
+    "russian": "Russian",
+    "es": "Spanish",
+    "spanish": "Spanish",
+    "sv": "Swedish",
+    "swedish": "Swedish",
+    "ta": "Tamil",
+    "tamil": "Tamil",
+    "tr": "Turkish",
+    "turkish": "Turkish",
+}
+
+SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)  # every accepted key
+
+
+def register_tokenizers(index: tantivy.Index, language: str) -> None:
+    """Register every custom tokenizer on *index*; tantivy needs this at each open."""
+    text_analyzer = _paperless_text(language)
+    index.register_tokenizer("paperless_text", text_analyzer)
+    sort_analyzer = _simple_analyzer()
+    index.register_tokenizer("simple_analyzer", sort_analyzer)
+    bigram_analyzer = _bigram_analyzer()
+    index.register_tokenizer("bigram_analyzer", bigram_analyzer)
+
+
+def _paperless_text(language: str) -> tantivy.TextAnalyzer:
+    """
+    Build ``paperless_text``: simple → remove_long(65) → lowercase → ascii_fold
+    [→ stemmer]. Empty language: no stemmer; unmapped language: warn, no stemmer.
+    """
+    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
+    builder = builder.filter(tantivy.Filter.remove_long(65))
+    builder = builder.filter(tantivy.Filter.lowercase())
+    builder = builder.filter(tantivy.Filter.ascii_fold())
+    stemmer_name = _LANGUAGE_MAP.get(language.lower()) if language else None
+    if stemmer_name:
+        builder = builder.filter(tantivy.Filter.stemmer(stemmer_name))
+    elif language:
+        logger.warning(
+            "Unsupported search language '%s' — stemming disabled. Supported: %s",
+            language,
+            ", ".join(sorted(SUPPORTED_LANGUAGES)),
+        )
+    return builder.build()
+
+
+def _simple_analyzer() -> tantivy.TextAnalyzer:
+    """
+    Build the analyzer for shadow sort fields: simple → lowercase → ascii_fold.
+    """
+    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
+    builder = builder.filter(tantivy.Filter.lowercase())
+    builder = builder.filter(tantivy.Filter.ascii_fold())
+    return builder.build()
+
+
+def _bigram_analyzer() -> tantivy.TextAnalyzer:
+    """
+    Build the analyzer for CJK / no-whitespace language support:
+    ngram(2,2) → lowercase.
+    """
+    bigrams = tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False)
+    builder = tantivy.TextAnalyzerBuilder(bigrams)
+    builder = builder.filter(tantivy.Filter.lowercase())
+    return builder.build()
diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py
new file mode 100644
index 000000000..6728784ae
--- /dev/null
+++ b/src/documents/tests/search/test_tokenizer.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import pytest
+import tantivy
+
+from documents.search._tokenizer import _bigram_analyzer
+from documents.search._tokenizer import _paperless_text
+from documents.search._tokenizer import register_tokenizers
+
+if TYPE_CHECKING:
+    from _pytest.logging import LogCaptureFixture
+
+pytestmark = pytest.mark.search
+
+
+class TestTokenizers:
+    @pytest.fixture
+    def content_index(self) -> tantivy.Index:
+        """Index with a single content field analyzed by paperless_text."""
+        fields = tantivy.SchemaBuilder()
+        fields.add_text_field("content", stored=True, tokenizer_name="paperless_text")
+        index = tantivy.Index(fields.build(), path=None)
+        # An empty language builds the analyzer without a stemmer.
+        index.register_tokenizer("paperless_text", _paperless_text(""))
+        return index
+
+    @pytest.fixture
+    def bigram_index(self) -> tantivy.Index:
+        """Index with a single unstored field analyzed by bigram_analyzer."""
+        fields = tantivy.SchemaBuilder()
+        fields.add_text_field(
+            "bigram_content",
+            stored=False,
+            tokenizer_name="bigram_analyzer",
+        )
+        index = tantivy.Index(fields.build(), path=None)
+        # Registered under the same name the schema field refers to.
+        index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
+        return index
+
+    def test_ascii_fold_finds_accented_content(
+        self,
+        content_index: tantivy.Index,
+    ) -> None:
+        """Accented text indexed via paperless_text matches an unaccented query."""
+        document = tantivy.Document()
+        document.add_text("content", "café résumé")
+        index_writer = content_index.writer()
+        index_writer.add_document(document)
+        index_writer.commit()
+        content_index.reload()
+        query = content_index.parse_query("cafe resume", ["content"])
+        assert content_index.searcher().search(query, limit=5).count == 1
+
+    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
+        """A two-character CJK query matches inside unsegmented indexed text."""
+        document = tantivy.Document()
+        document.add_text("bigram_content", "東京都")
+        index_writer = bigram_index.writer()
+        index_writer.add_document(document)
+        index_writer.commit()
+        bigram_index.reload()
+        query = bigram_index.parse_query("東京", ["bigram_content"])
+        assert bigram_index.searcher().search(query, limit=5).count == 1
+
+    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
+        """register_tokenizers warns, naming the language, when it is not mapped."""
+        fields = tantivy.SchemaBuilder()
+        fields.add_text_field("content", stored=True, tokenizer_name="paperless_text")
+        index = tantivy.Index(fields.build(), path=None)
+
+        with caplog.at_level(logging.WARNING, logger="paperless.search"):
+            register_tokenizers(index, "klingon")
+        assert "klingon" in caplog.text