mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-01 09:44:19 +00:00
feat(search): tokenizer registration — paperless_text with language stemming, simple_analyzer, bigram_analyzer
This commit implements Task 3 of the Tantivy search backend migration:
- Add `src/documents/search/_tokenizer.py` with three custom tokenizers:
- `paperless_text`: simple → remove_long(65) → lowercase → ascii_fold [→ stemmer]
Supports 18 languages via Snowball stemmer with fallback warning for unsupported languages
- `simple_analyzer`: simple → lowercase → ascii_fold (for shadow sort fields)
- `bigram_analyzer`: ngram(2,2) → lowercase (for CJK/no-whitespace language support)
- Add comprehensive tests in `src/documents/tests/search/test_tokenizer.py`:
- ASCII folding test: verifies "café résumé" is findable as "cafe resume"
- Bigram CJK test: verifies "東京都" is searchable by substring "東京"
- Warning test: verifies unsupported languages log appropriate warnings
- `register_tokenizers()` function must be called on every Index instance
as tantivy requires re-registration at each open
- Language support includes common ISO 639-1 codes and full names:
Arabic, Danish, Dutch, English, Finnish, French, German, Greek,
Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
Spanish, Swedish, Tamil, Turkish
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,101 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import tantivy
|
||||
|
||||
logger = logging.getLogger("paperless.search")

# Lookup table from a user-supplied language identifier to the Snowball
# stemmer name. Each language is reachable under two keys: its ISO 639-1
# code and its lowercase English name.
_LANGUAGE_MAP: dict[str, str] = {
    key: stemmer_name
    for iso_code, stemmer_name in (
        ("ar", "Arabic"),
        ("da", "Danish"),
        ("nl", "Dutch"),
        ("en", "English"),
        ("fi", "Finnish"),
        ("fr", "French"),
        ("de", "German"),
        ("el", "Greek"),
        ("hu", "Hungarian"),
        ("it", "Italian"),
        ("no", "Norwegian"),
        ("pt", "Portuguese"),
        ("ro", "Romanian"),
        ("ru", "Russian"),
        ("es", "Spanish"),
        ("sv", "Swedish"),
        ("ta", "Tamil"),
        ("tr", "Turkish"),
    )
    for key in (iso_code, stemmer_name.lower())
}

# Every identifier accepted as a language (codes and full names alike).
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP.keys())
|
||||
|
||||
|
||||
def register_tokenizers(index: tantivy.Index, language: str) -> None:
    """
    Attach the three custom analyzers to *index*.

    This has to run for every Index instance, because tantivy requires
    tokenizers to be re-registered each time an index is opened.

    *language* selects the stemmer used by ``paperless_text``; the other
    two analyzers do not depend on it.
    """
    # Factories are invoked lazily inside the loop so each analyzer is
    # built immediately before it is registered, in the listed order.
    analyzer_factories = (
        ("paperless_text", lambda: _paperless_text(language)),
        ("simple_analyzer", _simple_analyzer),
        ("bigram_analyzer", _bigram_analyzer),
    )
    for tokenizer_name, make_analyzer in analyzer_factories:
        index.register_tokenizer(tokenizer_name, make_analyzer())
|
||||
|
||||
|
||||
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
    """
    Build the analyzer: simple → remove_long(65) → lowercase → ascii_fold [→ stemmer]

    The stemmer stage is appended only when *language* (case-insensitive)
    is a key of ``_LANGUAGE_MAP``. An empty *language* skips stemming
    silently; a non-empty but unknown one skips it and logs a warning.
    """
    pipeline = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    pipeline = pipeline.filter(tantivy.Filter.remove_long(65))
    pipeline = pipeline.filter(tantivy.Filter.lowercase())
    pipeline = pipeline.filter(tantivy.Filter.ascii_fold())

    # No language configured: nothing to stem, nothing to warn about.
    if not language:
        return pipeline.build()

    stemmer_language = _LANGUAGE_MAP.get(language.lower())
    if not stemmer_language:
        logger.warning(
            "Unsupported search language '%s' — stemming disabled. Supported: %s",
            language,
            ", ".join(sorted(SUPPORTED_LANGUAGES)),
        )
        return pipeline.build()

    return pipeline.filter(tantivy.Filter.stemmer(stemmer_language)).build()
|
||||
|
||||
|
||||
def _simple_analyzer() -> tantivy.TextAnalyzer:
    """Build the analyzer: simple → lowercase → ascii_fold (shadow sort fields)."""
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    builder = builder.filter(tantivy.Filter.lowercase())
    builder = builder.filter(tantivy.Filter.ascii_fold())
    return builder.build()
|
||||
|
||||
|
||||
def _bigram_analyzer() -> tantivy.TextAnalyzer:
    """Build the analyzer: ngram(2,2) → lowercase (CJK / no-whitespace text)."""
    bigrams = tantivy.Tokenizer.ngram(
        min_gram=2,
        max_gram=2,
        prefix_only=False,
    )
    builder = tantivy.TextAnalyzerBuilder(bigrams)
    builder = builder.filter(tantivy.Filter.lowercase())
    return builder.build()
|
||||
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
import tantivy
|
||||
|
||||
from documents.search._tokenizer import _bigram_analyzer
|
||||
from documents.search._tokenizer import _paperless_text
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _pytest.logging import LogCaptureFixture
|
||||
|
||||
pytestmark = pytest.mark.search
|
||||
|
||||
|
||||
class TestTokenizers:
    """Tests for the analyzers defined in documents.search._tokenizer."""

    @staticmethod
    def _make_index(field: str, tokenizer_name: str, *, stored: bool) -> tantivy.Index:
        """Create an index (path=None) whose schema is a single text field."""
        schema_builder = tantivy.SchemaBuilder()
        schema_builder.add_text_field(
            field,
            stored=stored,
            tokenizer_name=tokenizer_name,
        )
        return tantivy.Index(schema_builder.build(), path=None)

    @staticmethod
    def _hits_after_indexing(
        index: tantivy.Index,
        field: str,
        text: str,
        query: str,
    ) -> int:
        """Index *text* as one document in *field*, then count hits for *query*."""
        writer = index.writer()
        document = tantivy.Document()
        document.add_text(field, text)
        writer.add_document(document)
        writer.commit()
        # Reload so the searcher sees the document that was just committed.
        index.reload()
        parsed_query = index.parse_query(query, [field])
        return index.searcher().search(parsed_query, limit=5).count

    @pytest.fixture
    def content_index(self) -> tantivy.Index:
        """Index with just a content field for ASCII folding tests."""
        index = self._make_index("content", "paperless_text", stored=True)
        # Empty language: the analyzer is built without a stemmer stage.
        index.register_tokenizer("paperless_text", _paperless_text(""))
        return index

    @pytest.fixture
    def bigram_index(self) -> tantivy.Index:
        """Index with bigram field for CJK tests."""
        index = self._make_index("bigram_content", "bigram_analyzer", stored=False)
        index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
        return index

    def test_ascii_fold_finds_accented_content(
        self,
        content_index: tantivy.Index,
    ) -> None:
        """paperless_text normalises diacritics so café is findable as cafe."""
        hits = self._hits_after_indexing(
            content_index,
            "content",
            "café résumé",
            "cafe resume",
        )
        assert hits == 1

    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
        """bigram_analyzer makes CJK substrings searchable without whitespace."""
        hits = self._hits_after_indexing(
            bigram_index,
            "bigram_content",
            "東京都",
            "東京",
        )
        assert hits == 1

    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
        """register_tokenizers warns when given a language it cannot stem."""
        index = self._make_index("content", "paperless_text", stored=True)

        with caplog.at_level(logging.WARNING, logger="paperless.search"):
            register_tokenizers(index, "klingon")
        assert "klingon" in caplog.text
|
||||
Reference in New Issue
Block a user