mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-07 22:29:45 +00:00
Enhancement: infer SEARCH_LANGUAGE from OCR_LANGUAGE; validate if explicit
- SEARCH_LANGUAGE is now str | None (None = no stemming, not "")
- When PAPERLESS_SEARCH_LANGUAGE is set, validate it against SUPPORTED_LANGUAGES via get_choice_from_env (startup error on bad value)
- When not set, infer from OCR_LANGUAGE's primary Tesseract code (eng→en, deu→de, fra→fr, etc.) covering all 18 Tantivy-supported languages
- _schema.py sentinel normalises None → "" for on-disk comparison
- _tokenizer.py type annotations updated to str | None
- docs: recommend ISO 639-1 two-letter codes; note that capitalized Tantivy enum names are not valid; link to Tantivy Language enum

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -94,7 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:
|
||||
if not language_file.exists():
|
||||
logger.info("Search index language sentinel missing - rebuilding.")
|
||||
return True
|
||||
if language_file.read_text().strip() != settings.SEARCH_LANGUAGE:
|
||||
if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""):
|
||||
logger.info("Search index language changed - rebuilding.")
|
||||
return True
|
||||
|
||||
@@ -113,7 +113,7 @@ def wipe_index(index_dir: Path) -> None:
|
||||
def _write_sentinels(index_dir: Path) -> None:
|
||||
"""Write schema version and language sentinel files so the next index open can skip rebuilding."""
|
||||
(index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
|
||||
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
|
||||
(index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE or "")
|
||||
|
||||
|
||||
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
|
||||
|
||||
@@ -49,7 +49,7 @@ _LANGUAGE_MAP: dict[str, str] = {
|
||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||
|
||||
|
||||
def register_tokenizers(index: tantivy.Index, language: str) -> None:
|
||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
"""
|
||||
Register all custom tokenizers on *index*. Must be called on every Index
|
||||
instance — tantivy requires re-registration at each open.
|
||||
@@ -66,7 +66,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:
|
||||
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
|
||||
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
|
||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||
builder = (
|
||||
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
||||
|
||||
Reference in New Issue
Block a user