Enhancement: infer SEARCH_LANGUAGE from OCR_LANGUAGE; validate if explicit

- SEARCH_LANGUAGE is now str | None (None = no stemming, not "")
- When PAPERLESS_SEARCH_LANGUAGE is set, validate it against
  SUPPORTED_LANGUAGES via get_choice_from_env (startup error on bad value)
- When not set, infer from OCR_LANGUAGE's primary Tesseract code
  (eng→en, deu→de, fra→fr, etc.) covering all 18 Tantivy-supported languages
- _schema.py sentinel normalises None → "" for on-disk comparison
- _tokenizer.py type annotations updated to str | None
- docs: recommend ISO 639-1 two-letter codes; note that capitalized
  Tantivy enum names are not valid; link to Tantivy Language enum

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 13:37:34 -07:00
parent b10f3de2eb
commit fdf08bdc43
4 changed files with 62 additions and 9 deletions
+2 -2
View File
@@ -94,7 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:
if not language_file.exists():
logger.info("Search index language sentinel missing - rebuilding.")
return True
if language_file.read_text().strip() != settings.SEARCH_LANGUAGE:
if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""):
logger.info("Search index language changed - rebuilding.")
return True
@@ -113,7 +113,7 @@ def wipe_index(index_dir: Path) -> None:
def _write_sentinels(index_dir: Path) -> None:
"""Write schema version and language sentinel files so the next index open can skip rebuilding."""
(index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE or "")
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
+2 -2
View File
@@ -49,7 +49,7 @@ _LANGUAGE_MAP: dict[str, str] = {
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
def register_tokenizers(index: tantivy.Index, language: str) -> None:
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
"""
Register all custom tokenizers on *index*. Must be called on every Index
instance — tantivy requires re-registration at each open.
@@ -66,7 +66,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
builder = (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())