Enhancement: infer SEARCH_LANGUAGE from OCR_LANGUAGE; validate if explicit

- SEARCH_LANGUAGE is now str | None (None = no stemming, not "")
- When PAPERLESS_SEARCH_LANGUAGE is set, validate it against
  SUPPORTED_LANGUAGES via get_choice_from_env (startup error on bad value)
- When not set, infer from OCR_LANGUAGE's primary Tesseract code
  (eng→en, deu→de, fra→fr, etc.) covering all 18 Tantivy-supported languages
- _schema.py sentinel normalises None → "" for on-disk comparison
- _tokenizer.py type annotations updated to str | None
- docs: recommend ISO 639-1 two-letter codes; note that capitalized
  Tantivy enum names are not valid; link to Tantivy Language enum

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-30 13:37:34 -07:00
parent b10f3de2eb
commit fdf08bdc43
4 changed files with 62 additions and 9 deletions

View File

@@ -1105,12 +1105,21 @@ should be a valid crontab(5) expression describing when to run.
#### [`PAPERLESS_SEARCH_LANGUAGE=<language>`](#PAPERLESS_SEARCH_LANGUAGE) {#PAPERLESS_SEARCH_LANGUAGE}
: Sets the stemmer language for the full-text search index (e.g. `en`, `de`, `fr`).
: Sets the stemmer language for the full-text search index.
Stemming improves recall by matching word variants (e.g. "running" matches "run").
Changing this setting causes the index to be rebuilt automatically on next startup.
Supported values are the language names accepted by Tantivy's built-in stemmer.
An invalid value raises an error at startup.
Defaults to `""` (no stemming).
: Use the ISO 639-1 two-letter code (e.g. `en`, `de`, `fr`). Lowercase full names
(e.g. `english`, `german`, `french`) are also accepted. The capitalized names shown
in the [Tantivy Language enum](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html)
documentation are **not** valid — use the lowercase equivalent.
: If not set, paperless infers the language from
[`PAPERLESS_OCR_LANGUAGE`](#PAPERLESS_OCR_LANGUAGE). If the OCR language has no
Tantivy stemmer equivalent, stemming is disabled.
Defaults to unset (inferred from `PAPERLESS_OCR_LANGUAGE`).
#### [`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD=<float>`](#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD) {#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD}

View File

@@ -94,7 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:
if not language_file.exists():
logger.info("Search index language sentinel missing - rebuilding.")
return True
if language_file.read_text().strip() != settings.SEARCH_LANGUAGE:
if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""):
logger.info("Search index language changed - rebuilding.")
return True
@@ -113,7 +113,7 @@ def wipe_index(index_dir: Path) -> None:
def _write_sentinels(index_dir: Path) -> None:
"""Write schema version and language sentinel files so the next index open can skip rebuilding."""
(index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE or "")
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:

View File

@@ -49,7 +49,7 @@ _LANGUAGE_MAP: dict[str, str] = {
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
def register_tokenizers(index: tantivy.Index, language: str) -> None:
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
"""
Register all custom tokenizers on *index*. Must be called on every Index
instance — tantivy requires re-registration at each open.
@@ -66,7 +66,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
builder = (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())

View File

@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
from paperless.settings.custom import parse_ignore_dates
from paperless.settings.custom import parse_redis_url
from paperless.settings.parsers import get_bool_from_env
from paperless.settings.parsers import get_choice_from_env
from paperless.settings.parsers import get_float_from_env
from paperless.settings.parsers import get_int_from_env
from paperless.settings.parsers import get_list_from_env
@@ -86,8 +87,6 @@ EMPTY_TRASH_DIR = (
MEDIA_LOCK = MEDIA_ROOT / "media.lock"
INDEX_DIR = DATA_DIR / "index"
SEARCH_LANGUAGE: str = os.getenv("PAPERLESS_SEARCH_LANGUAGE", "")
ADVANCED_FUZZY_SEARCH_THRESHOLD: float | None = get_float_from_env(
"PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD",
)
@@ -1040,10 +1039,55 @@ def _get_nltk_language_setting(ocr_lang: str) -> str | None:
return iso_code_to_nltk.get(ocr_lang)
def _get_search_language_setting(ocr_lang: str) -> str | None:
"""
Determine the Tantivy stemmer language.
If PAPERLESS_SEARCH_LANGUAGE is explicitly set, it is validated against
the languages supported by Tantivy's built-in stemmer and returned as-is.
Otherwise the primary Tesseract language code from PAPERLESS_OCR_LANGUAGE
is mapped to the corresponding ISO 639-1 code understood by Tantivy.
Returns None when unset and the OCR language has no Tantivy stemmer.
"""
explicit = os.environ.get("PAPERLESS_SEARCH_LANGUAGE")
if explicit is not None:
# Lazy import avoids any app-loading order concerns; _tokenizer has no
# Django dependencies so this is safe.
from documents.search._tokenizer import SUPPORTED_LANGUAGES
return get_choice_from_env("PAPERLESS_SEARCH_LANGUAGE", SUPPORTED_LANGUAGES)
# Infer from the primary Tesseract language code (ISO 639-2/T → ISO 639-1)
primary = ocr_lang.split("+", maxsplit=1)[0].lower()
_ocr_to_search: dict[str, str] = {
"ara": "ar",
"dan": "da",
"nld": "nl",
"eng": "en",
"fin": "fi",
"fra": "fr",
"deu": "de",
"ell": "el",
"hun": "hu",
"ita": "it",
"nor": "no",
"por": "pt",
"ron": "ro",
"rus": "ru",
"spa": "es",
"swe": "sv",
"tam": "ta",
"tur": "tr",
}
return _ocr_to_search.get(primary)
# Whether NLTK-based processing is enabled; PAPERLESS_ENABLE_NLTK defaults to "yes".
NLTK_ENABLED: Final[bool] = get_bool_from_env("PAPERLESS_ENABLE_NLTK", "yes")
# NLTK language derived from the configured OCR language; None when unmapped.
NLTK_LANGUAGE: str | None = _get_nltk_language_setting(OCR_LANGUAGE)
# Tantivy stemmer language: explicit PAPERLESS_SEARCH_LANGUAGE if set (validated),
# otherwise inferred from OCR_LANGUAGE; None disables stemming.
SEARCH_LANGUAGE: str | None = _get_search_language_setting(OCR_LANGUAGE)
###############################################################################
# Email Preprocessors #
###############################################################################