mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-30 21:02:45 +00:00
Enhancement: infer SEARCH_LANGUAGE from OCR_LANGUAGE; validate if explicit
- SEARCH_LANGUAGE is now str | None (None = no stemming, not "")
- When PAPERLESS_SEARCH_LANGUAGE is set, validate it against SUPPORTED_LANGUAGES via get_choice_from_env (startup error on bad value)
- When not set, infer from OCR_LANGUAGE's primary Tesseract code (eng→en, deu→de, fra→fr, etc.) covering all 18 Tantivy-supported languages
- _schema.py sentinel normalises None → "" for on-disk comparison
- _tokenizer.py type annotations updated to str | None
- docs: recommend ISO 639-1 two-letter codes; note that capitalized Tantivy enum names are not valid; link to Tantivy Language enum

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1105,12 +1105,21 @@ should be a valid crontab(5) expression describing when to run.
|
||||
|
||||
#### [`PAPERLESS_SEARCH_LANGUAGE=<language>`](#PAPERLESS_SEARCH_LANGUAGE) {#PAPERLESS_SEARCH_LANGUAGE}
|
||||
|
||||
: Sets the stemmer language for the full-text search index (e.g. `en`, `de`, `fr`).
|
||||
: Sets the stemmer language for the full-text search index.
|
||||
Stemming improves recall by matching word variants (e.g. "running" matches "run").
|
||||
Changing this setting causes the index to be rebuilt automatically on next startup.
|
||||
Supported values are the language names accepted by Tantivy's built-in stemmer.
|
||||
An invalid value raises an error at startup.
|
||||
|
||||
Defaults to `""` (no stemming).
|
||||
: Use the ISO 639-1 two-letter code (e.g. `en`, `de`, `fr`). Lowercase full names
|
||||
(e.g. `english`, `german`, `french`) are also accepted. The capitalized names shown
|
||||
in the [Tantivy Language enum](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html)
|
||||
documentation are **not** valid — use the lowercase equivalent.
|
||||
|
||||
: If not set, paperless infers the language from the first language listed in
|
||||
[`PAPERLESS_OCR_LANGUAGE`](#PAPERLESS_OCR_LANGUAGE). If the OCR language has no
|
||||
Tantivy stemmer equivalent, stemming is disabled.
|
||||
|
||||
Defaults to unset (inferred from `PAPERLESS_OCR_LANGUAGE`).
|
||||
|
||||
#### [`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD=<float>`](#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD) {#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD}
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:
|
||||
if not language_file.exists():
|
||||
logger.info("Search index language sentinel missing - rebuilding.")
|
||||
return True
|
||||
if language_file.read_text().strip() != settings.SEARCH_LANGUAGE:
|
||||
if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""):
|
||||
logger.info("Search index language changed - rebuilding.")
|
||||
return True
|
||||
|
||||
@@ -113,7 +113,7 @@ def wipe_index(index_dir: Path) -> None:
|
||||
def _write_sentinels(index_dir: Path) -> None:
|
||||
"""Write schema version and language sentinel files so the next index open can skip rebuilding."""
|
||||
(index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
|
||||
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
|
||||
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE or "")
|
||||
|
||||
|
||||
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
|
||||
|
||||
@@ -49,7 +49,7 @@ _LANGUAGE_MAP: dict[str, str] = {
|
||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||
|
||||
|
||||
def register_tokenizers(index: tantivy.Index, language: str) -> None:
|
||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
"""
|
||||
Register all custom tokenizers on *index*. Must be called on every Index
|
||||
instance — tantivy requires re-registration at each open.
|
||||
@@ -66,7 +66,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:
|
||||
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
|
||||
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
|
||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||
builder = (
|
||||
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
||||
|
||||
@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
|
||||
from paperless.settings.custom import parse_ignore_dates
|
||||
from paperless.settings.custom import parse_redis_url
|
||||
from paperless.settings.parsers import get_bool_from_env
|
||||
from paperless.settings.parsers import get_choice_from_env
|
||||
from paperless.settings.parsers import get_float_from_env
|
||||
from paperless.settings.parsers import get_int_from_env
|
||||
from paperless.settings.parsers import get_list_from_env
|
||||
@@ -86,8 +87,6 @@ EMPTY_TRASH_DIR = (
|
||||
MEDIA_LOCK = MEDIA_ROOT / "media.lock"
|
||||
INDEX_DIR = DATA_DIR / "index"
|
||||
|
||||
SEARCH_LANGUAGE: str = os.getenv("PAPERLESS_SEARCH_LANGUAGE", "")
|
||||
|
||||
ADVANCED_FUZZY_SEARCH_THRESHOLD: float | None = get_float_from_env(
|
||||
"PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD",
|
||||
)
|
||||
@@ -1040,10 +1039,55 @@ def _get_nltk_language_setting(ocr_lang: str) -> str | None:
|
||||
return iso_code_to_nltk.get(ocr_lang)
|
||||
|
||||
|
||||
def _get_search_language_setting(ocr_lang: str) -> str | None:
|
||||
"""
|
||||
Determine the Tantivy stemmer language.
|
||||
|
||||
If PAPERLESS_SEARCH_LANGUAGE is explicitly set, it is validated against
|
||||
the languages supported by Tantivy's built-in stemmer and returned as-is.
|
||||
Otherwise the primary Tesseract language code from PAPERLESS_OCR_LANGUAGE
|
||||
is mapped to the corresponding ISO 639-1 code understood by Tantivy.
|
||||
Returns None when unset and the OCR language has no Tantivy stemmer.
|
||||
"""
|
||||
explicit = os.environ.get("PAPERLESS_SEARCH_LANGUAGE")
|
||||
if explicit is not None:
|
||||
# Lazy import avoids any app-loading order concerns; _tokenizer has no
|
||||
# Django dependencies so this is safe.
|
||||
from documents.search._tokenizer import SUPPORTED_LANGUAGES
|
||||
|
||||
return get_choice_from_env("PAPERLESS_SEARCH_LANGUAGE", SUPPORTED_LANGUAGES)
|
||||
|
||||
# Infer from the primary Tesseract language code (ISO 639-2/T → ISO 639-1)
|
||||
primary = ocr_lang.split("+", maxsplit=1)[0].lower()
|
||||
_ocr_to_search: dict[str, str] = {
|
||||
"ara": "ar",
|
||||
"dan": "da",
|
||||
"nld": "nl",
|
||||
"eng": "en",
|
||||
"fin": "fi",
|
||||
"fra": "fr",
|
||||
"deu": "de",
|
||||
"ell": "el",
|
||||
"hun": "hu",
|
||||
"ita": "it",
|
||||
"nor": "no",
|
||||
"por": "pt",
|
||||
"ron": "ro",
|
||||
"rus": "ru",
|
||||
"spa": "es",
|
||||
"swe": "sv",
|
||||
"tam": "ta",
|
||||
"tur": "tr",
|
||||
}
|
||||
return _ocr_to_search.get(primary)
|
||||
|
||||
|
||||
NLTK_ENABLED: Final[bool] = get_bool_from_env("PAPERLESS_ENABLE_NLTK", "yes")
|
||||
|
||||
NLTK_LANGUAGE: str | None = _get_nltk_language_setting(OCR_LANGUAGE)
|
||||
|
||||
SEARCH_LANGUAGE: str | None = _get_search_language_setting(OCR_LANGUAGE)
|
||||
|
||||
###############################################################################
|
||||
# Email Preprocessors #
|
||||
###############################################################################
|
||||
|
||||
Reference in New Issue
Block a user