diff --git a/docs/configuration.md b/docs/configuration.md index 0482e9ee4..a22171ce9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1105,12 +1105,21 @@ should be a valid crontab(5) expression describing when to run. #### [`PAPERLESS_SEARCH_LANGUAGE=`](#PAPERLESS_SEARCH_LANGUAGE) {#PAPERLESS_SEARCH_LANGUAGE} -: Sets the stemmer language for the full-text search index (e.g. `en`, `de`, `fr`). +: Sets the stemmer language for the full-text search index. Stemming improves recall by matching word variants (e.g. "running" matches "run"). Changing this setting causes the index to be rebuilt automatically on next startup. -Supported values are the language names accepted by Tantivy's built-in stemmer. +A value that is not a supported stemmer language (see the accepted values below) raises an error at startup. - Defaults to `""` (no stemming). +: Use the ISO 639-1 two-letter code (e.g. `en`, `de`, `fr`). Lowercase full names +(e.g. `english`, `german`, `french`) are also accepted. The capitalized names shown +in the [Tantivy Language enum](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) +documentation are **not** valid — use the lowercase equivalent. + +: If not set, paperless infers the language from +[`PAPERLESS_OCR_LANGUAGE`](#PAPERLESS_OCR_LANGUAGE). If the OCR language has no +Tantivy stemmer equivalent, stemming is disabled. + + Defaults to unset (inferred from `PAPERLESS_OCR_LANGUAGE`). 
#### [`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD=`](#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD) {#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD} diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index d41aeeb55..cb5b85e3e 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -94,7 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool: if not language_file.exists(): logger.info("Search index language sentinel missing - rebuilding.") return True - if language_file.read_text().strip() != settings.SEARCH_LANGUAGE: + if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""): logger.info("Search index language changed - rebuilding.") return True @@ -113,7 +113,7 @@ def wipe_index(index_dir: Path) -> None: def _write_sentinels(index_dir: Path) -> None: """Write schema version and language sentinel files so the next index open can skip rebuilding.""" (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION)) - (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE) + (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE or "") def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index: diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index ed30d5c37..628b9dcdb 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -49,7 +49,7 @@ _LANGUAGE_MAP: dict[str, str] = { SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP) -def register_tokenizers(index: tantivy.Index, language: str) -> None: +def register_tokenizers(index: tantivy.Index, language: str | None) -> None: """ Register all custom tokenizers on *index*. Must be called on every Index instance — tantivy requires re-registration at each open. 
@@ -66,7 +66,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None: index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer()) -def _paperless_text(language: str) -> tantivy.TextAnalyzer: +def _paperless_text(language: str | None) -> tantivy.TextAnalyzer: """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]""" builder = ( tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple()) diff --git a/src/paperless/settings/__init__.py b/src/paperless/settings/__init__.py index 4fee16dde..3522b3187 100644 --- a/src/paperless/settings/__init__.py +++ b/src/paperless/settings/__init__.py @@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings from paperless.settings.custom import parse_ignore_dates from paperless.settings.custom import parse_redis_url from paperless.settings.parsers import get_bool_from_env +from paperless.settings.parsers import get_choice_from_env from paperless.settings.parsers import get_float_from_env from paperless.settings.parsers import get_int_from_env from paperless.settings.parsers import get_list_from_env @@ -86,8 +87,6 @@ EMPTY_TRASH_DIR = ( MEDIA_LOCK = MEDIA_ROOT / "media.lock" INDEX_DIR = DATA_DIR / "index" -SEARCH_LANGUAGE: str = os.getenv("PAPERLESS_SEARCH_LANGUAGE", "") - ADVANCED_FUZZY_SEARCH_THRESHOLD: float | None = get_float_from_env( "PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD", ) @@ -1040,10 +1039,55 @@ def _get_nltk_language_setting(ocr_lang: str) -> str | None: return iso_code_to_nltk.get(ocr_lang) +def _get_search_language_setting(ocr_lang: str) -> str | None: + """ + Determine the Tantivy stemmer language. + + If PAPERLESS_SEARCH_LANGUAGE is explicitly set, it is validated against + the languages supported by Tantivy's built-in stemmer and returned as-is. + Otherwise the primary Tesseract language code from PAPERLESS_OCR_LANGUAGE + is mapped to the corresponding ISO 639-1 code understood by Tantivy. 
+    Returns None when unset and the OCR language has no Tantivy stemmer equivalent. + """ + explicit = os.environ.get("PAPERLESS_SEARCH_LANGUAGE") + if explicit is not None: + # Lazy import avoids any app-loading order concerns; _tokenizer has no + # Django dependencies so this is safe. + from documents.search._tokenizer import SUPPORTED_LANGUAGES + + return get_choice_from_env("PAPERLESS_SEARCH_LANGUAGE", SUPPORTED_LANGUAGES) + + # Infer from the primary Tesseract language code (ISO 639-2/T → ISO 639-1) + primary = ocr_lang.split("+", maxsplit=1)[0].lower() + _ocr_to_search: dict[str, str] = { + "ara": "ar", + "dan": "da", + "nld": "nl", + "eng": "en", + "fin": "fi", + "fra": "fr", + "deu": "de", + "ell": "el", + "hun": "hu", + "ita": "it", + "nor": "no", + "por": "pt", + "ron": "ro", + "rus": "ru", + "spa": "es", + "swe": "sv", + "tam": "ta", + "tur": "tr", + } + return _ocr_to_search.get(primary) + + NLTK_ENABLED: Final[bool] = get_bool_from_env("PAPERLESS_ENABLE_NLTK", "yes") NLTK_LANGUAGE: str | None = _get_nltk_language_setting(OCR_LANGUAGE) +SEARCH_LANGUAGE: str | None = _get_search_language_setting(OCR_LANGUAGE) + ############################################################################### # Email Preprocessors # ###############################################################################