feat(search): tokenizer registration — paperless_text with language stemming, simple_analyzer, bigram_analyzer

This commit implements Task 3 of the Tantivy search backend migration:

- Add `src/documents/search/_tokenizer.py` with three custom tokenizers:
  - `paperless_text`: simple → remove_long(65) → lowercase → ascii_fold [→ stemmer]
    Supports 18 languages via Snowball stemmer with fallback warning for unsupported languages
  - `simple_analyzer`: simple → lowercase → ascii_fold (for shadow sort fields)
  - `bigram_analyzer`: ngram(2,2) → lowercase (for CJK/no-whitespace language support)

- Add comprehensive tests in `src/documents/tests/search/test_tokenizer.py`:
  - ASCII folding test: verifies "café résumé" is findable as "cafe resume"
  - Bigram CJK test: verifies "東京都" is searchable by substring "東京"
  - Warning test: verifies unsupported languages log appropriate warnings

- The `register_tokenizers()` function must be called on every Index instance,
  because tantivy requires tokenizers to be re-registered each time an index is opened

- Language support includes common ISO 639-1 codes and full names:
  Arabic, Danish, Dutch, English, Finnish, French, German, Greek,
  Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
  Spanish, Swedish, Tamil, Turkish

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-29 14:30:40 -07:00
parent 884edd6eea
commit e8fe3a6a62
2 changed files with 178 additions and 0 deletions
+101
View File
@@ -0,0 +1,101 @@
from __future__ import annotations
import logging
import tantivy
logger = logging.getLogger("paperless.search")

# One (ISO 639-1 code, Tantivy Snowball name) pair per language that has a
# stemmer entry in this module.
_STEMMER_LANGUAGES: tuple[tuple[str, str], ...] = (
    ("ar", "Arabic"),
    ("da", "Danish"),
    ("nl", "Dutch"),
    ("en", "English"),
    ("fi", "Finnish"),
    ("fr", "French"),
    ("de", "German"),
    ("el", "Greek"),
    ("hu", "Hungarian"),
    ("it", "Italian"),
    ("no", "Norwegian"),
    ("pt", "Portuguese"),
    ("ro", "Romanian"),
    ("ru", "Russian"),
    ("es", "Spanish"),
    ("sv", "Swedish"),
    ("ta", "Tamil"),
    ("tr", "Turkish"),
)

# Mapping of ISO 639-1 codes (and the lowercase English language name as an
# alias) → Tantivy Snowball name. Each language contributes two keys: its
# code first, then its lowercase name.
_LANGUAGE_MAP: dict[str, str] = {
    key: snowball_name
    for iso_code, snowball_name in _STEMMER_LANGUAGES
    for key in (iso_code, snowball_name.lower())
}

# Every key accepted by _LANGUAGE_MAP (codes and lowercase names alike).
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
def register_tokenizers(index: tantivy.Index, language: str) -> None:
    """
    Attach the three custom analyzers to *index* under the names
    ``paperless_text``, ``simple_analyzer`` and ``bigram_analyzer``.

    Must be called on every Index instance — tantivy requires
    re-registration at each open.

    *language* is forwarded to the ``paperless_text`` analyzer only; the
    other two analyzers take no arguments.
    """
    # Factories are invoked one per iteration, so each analyzer is built
    # immediately before it is registered.
    analyzer_factories = (
        ("paperless_text", lambda: _paperless_text(language)),
        ("simple_analyzer", _simple_analyzer),
        ("bigram_analyzer", _bigram_analyzer),
    )
    for tokenizer_name, make_analyzer in analyzer_factories:
        index.register_tokenizer(tokenizer_name, make_analyzer())
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
    """
    Build the ``paperless_text`` analyzer:
    simple → remove_long(65) → lowercase → ascii_fold [→ stemmer]

    The stemmer stage is appended only when *language* (compared
    case-insensitively) is a key of ``_LANGUAGE_MAP``. A falsy *language*
    skips stemming silently; any other unknown value logs a warning and
    the analyzer is built without a stemmer.
    """
    pipeline = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    pipeline = pipeline.filter(tantivy.Filter.remove_long(65))
    pipeline = pipeline.filter(tantivy.Filter.lowercase())
    pipeline = pipeline.filter(tantivy.Filter.ascii_fold())

    # No language configured: nothing to stem, nothing to warn about.
    if not language:
        return pipeline.build()

    snowball_name = _LANGUAGE_MAP.get(language.lower())
    if snowball_name:
        return pipeline.filter(tantivy.Filter.stemmer(snowball_name)).build()

    # Unknown language: keep the analyzer usable, just without stemming.
    logger.warning(
        "Unsupported search language '%s' — stemming disabled. Supported: %s",
        language,
        ", ".join(sorted(SUPPORTED_LANGUAGES)),
    )
    return pipeline.build()
def _simple_analyzer() -> tantivy.TextAnalyzer:
    """
    Build the ``simple_analyzer``: simple → lowercase → ascii_fold.

    Used for shadow sort fields.
    """
    pipeline = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    pipeline = pipeline.filter(tantivy.Filter.lowercase())
    pipeline = pipeline.filter(tantivy.Filter.ascii_fold())
    return pipeline.build()
def _bigram_analyzer() -> tantivy.TextAnalyzer:
    """
    Build the ``bigram_analyzer``: ngram(2,2) → lowercase.

    Intended for CJK / no-whitespace language support.
    """
    bigram_tokenizer = tantivy.Tokenizer.ngram(
        min_gram=2,
        max_gram=2,
        prefix_only=False,
    )
    pipeline = tantivy.TextAnalyzerBuilder(bigram_tokenizer)
    pipeline = pipeline.filter(tantivy.Filter.lowercase())
    return pipeline.build()
@@ -0,0 +1,77 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
import pytest
import tantivy
from documents.search._tokenizer import _bigram_analyzer
from documents.search._tokenizer import _paperless_text
from documents.search._tokenizer import register_tokenizers
if TYPE_CHECKING:
from _pytest.logging import LogCaptureFixture
pytestmark = pytest.mark.search
class TestTokenizers:
    """Checks for the custom tokenizers, each run against a freshly built index."""

    @staticmethod
    def _single_field_index(
        field: str,
        tokenizer_name: str,
        *,
        stored: bool,
    ) -> tantivy.Index:
        """
        Build an index (``path=None``) whose schema has exactly one text field.

        No tokenizer is registered here; callers do that themselves.
        """
        schema_builder = tantivy.SchemaBuilder()
        schema_builder.add_text_field(
            field,
            stored=stored,
            tokenizer_name=tokenizer_name,
        )
        return tantivy.Index(schema_builder.build(), path=None)

    @staticmethod
    def _hits_after_indexing(
        index: tantivy.Index,
        field: str,
        text: str,
        query: str,
    ) -> int:
        """
        Index one document holding *text* in *field*, commit, reload, then
        return the hit count for *query* parsed against that same field.
        """
        writer = index.writer()
        document = tantivy.Document()
        document.add_text(field, text)
        writer.add_document(document)
        writer.commit()
        index.reload()
        parsed_query = index.parse_query(query, [field])
        return index.searcher().search(parsed_query, limit=5).count

    @pytest.fixture
    def content_index(self) -> tantivy.Index:
        """Index with just a content field for ASCII folding tests."""
        index = self._single_field_index("content", "paperless_text", stored=True)
        # Empty language: the analyzer is built without a stemmer stage.
        index.register_tokenizer("paperless_text", _paperless_text(""))
        return index

    @pytest.fixture
    def bigram_index(self) -> tantivy.Index:
        """Index with bigram field for CJK tests."""
        index = self._single_field_index(
            "bigram_content",
            "bigram_analyzer",
            stored=False,
        )
        index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
        return index

    def test_ascii_fold_finds_accented_content(
        self,
        content_index: tantivy.Index,
    ) -> None:
        """paperless_text normalises diacritics so café is findable as cafe."""
        hits = self._hits_after_indexing(
            content_index,
            "content",
            "café résumé",
            "cafe resume",
        )
        assert hits == 1

    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
        """bigram_analyzer makes CJK substrings searchable without whitespace."""
        hits = self._hits_after_indexing(
            bigram_index,
            "bigram_content",
            "東京都",
            "東京",
        )
        assert hits == 1

    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
        """register_tokenizers logs a warning naming an unsupported language."""
        index = self._single_field_index("content", "paperless_text", stored=True)
        with caplog.at_level(logging.WARNING, logger="paperless.search"):
            register_tokenizers(index, "klingon")
        assert "klingon" in caplog.text