paperless-ngx/src/documents/search/_schema.py

from __future__ import annotations

import logging
import shutil
from typing import TYPE_CHECKING

import tantivy
from django.conf import settings

if TYPE_CHECKING:
    from pathlib import Path

# Shared logger for the search subsystem; used below to report why an index
# rebuild is being requested.
logger = logging.getLogger("paperless.search")

# Version of the on-disk index layout. _write_sentinels() persists it to the
# ".schema_version" sentinel and needs_rebuild() compares the stored value
# against this constant, so changing it causes existing indexes to be wiped
# and recreated the next time open_or_rebuild_index() runs.
SCHEMA_VERSION = 1


def build_schema() -> tantivy.Schema:
    """
    Assemble the Tantivy schema used by the paperless document index.

    The schema covers full-text search, filtering, sorting and autocomplete:
    document content and metadata, permission ids, custom fields and notes.
    Fields are registered in a fixed order; keep that order stable when
    editing this function.

    Returns:
        The built Tantivy schema, ready to be handed to an index.
    """
    text_tokenizer = "paperless_text"
    builder = tantivy.SchemaBuilder()

    # Primary key and checksum (checksum uses the "raw" tokenizer).
    builder.add_unsigned_field("id", stored=True, indexed=True, fast=True)
    builder.add_text_field("checksum", stored=True, tokenizer_name="raw")

    # Stored full-text fields.
    full_text_fields = (
        "title",
        "correspondent",
        "document_type",
        "storage_path",
        "original_filename",
        "content",
    )
    for name in full_text_fields:
        builder.add_text_field(name, stored=True, tokenizer_name=text_tokenizer)

    # Shadow fields used for sorting: fast fields that are not stored.
    sort_shadow_fields = ("title_sort", "correspondent_sort", "type_sort")
    for name in sort_shadow_fields:
        builder.add_text_field(
            name,
            stored=False,
            fast=True,
            tokenizer_name="simple_analyzer",
        )

    # CJK support: bigram-tokenized copy of the content, not stored.
    builder.add_text_field(
        "bigram_content",
        stored=False,
        tokenizer_name="bigram_analyzer",
    )

    # Words for the autocomplete prefix scan: stored, "raw" tokenizer.
    builder.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

    builder.add_text_field("tag", stored=True, tokenizer_name=text_tokenizer)

    # JSON fields allow structured queries such as notes.user:alice or
    # custom_fields.name:invoice. tantivy-py 0.25 does not support dotted paths
    # in parse_query default_field_names, so the companion text fields below
    # (note, custom_field) carry the content for default full-text search.
    for name in ("notes", "custom_fields"):
        builder.add_json_field(name, stored=True, tokenizer_name=text_tokenizer)

    # Companion text fields for default full-text search; not stored, so they
    # add no stored-document cost.
    for name in ("note", "custom_field"):
        builder.add_text_field(name, stored=False, tokenizer_name=text_tokenizer)

    # Numeric ids used for filtering: indexed and fast, but not stored.
    filter_id_fields = (
        "correspondent_id",
        "document_type_id",
        "storage_path_id",
        "tag_id",
        "owner_id",
        "viewer_id",
    )
    for name in filter_id_fields:
        builder.add_unsigned_field(name, stored=False, indexed=True, fast=True)

    # Timestamps: stored, indexed and fast.
    for name in ("created", "modified", "added"):
        builder.add_date_field(name, stored=True, indexed=True, fast=True)

    # Numeric document attributes: stored, indexed and fast.
    for name in ("asn", "page_count", "num_notes"):
        builder.add_unsigned_field(name, stored=True, indexed=True, fast=True)

    return builder.build()


def needs_rebuild(index_dir: Path) -> bool:
    """
    Check if the search index needs rebuilding.

    Compares the current schema version and search language configuration
    against the sentinel files ``.schema_version`` and ``.schema_language``
    inside ``index_dir``. A missing or unparsable sentinel is treated the
    same as a mismatch.

    Args:
        index_dir: Path to the search index directory

    Returns:
        True if the index needs rebuilding, False if it's up to date
    """
    # Sentinel lookups that mean "no such file" - including the case where
    # index_dir itself is absent or is not a directory.
    missing = (FileNotFoundError, NotADirectoryError)

    # Read directly instead of exists()-then-read so that a sentinel removed
    # between the check and the read cannot raise out of this function.
    try:
        stored_version = int((index_dir / ".schema_version").read_text().strip())
    except missing:
        return True
    except ValueError:
        # Covers non-numeric content as well as undecodable bytes
        # (UnicodeDecodeError is a ValueError subclass).
        logger.info("Search index schema version sentinel invalid - rebuilding.")
        return True

    if stored_version != SCHEMA_VERSION:
        logger.info("Search index schema version mismatch - rebuilding.")
        return True

    try:
        stored_language = (index_dir / ".schema_language").read_text().strip()
    except missing:
        logger.info("Search index language sentinel missing - rebuilding.")
        return True

    # An unset SEARCH_LANGUAGE is persisted as an empty string.
    if stored_language != (settings.SEARCH_LANGUAGE or ""):
        logger.info("Search index language changed - rebuilding.")
        return True

    return False


def wipe_index(index_dir: Path) -> None:
    """
    Delete all contents of the index directory to prepare for rebuild.

    Recursively removes all files and subdirectories within the index
    directory while preserving the directory itself. Symbolic links are
    removed as links; whatever they point to is left untouched.

    Args:
        index_dir: Path to the search index directory to clear
    """
    # Snapshot the listing first so entries are not deleted while the
    # directory is still being iterated.
    for child in list(index_dir.iterdir()):
        # is_dir() follows symlinks, and shutil.rmtree() refuses to operate on
        # a symlink to a directory, so links must be unlinked rather than
        # recursed into.
        if child.is_dir() and not child.is_symlink():
            shutil.rmtree(child)
        else:
            child.unlink()


def _write_sentinels(index_dir: Path) -> None:
    """
    Record the current schema version and search language in ``index_dir``.

    Writes the ``.schema_version`` and ``.schema_language`` sentinel files
    that ``needs_rebuild`` reads, so a later open can skip rebuilding.

    Args:
        index_dir: Path to the search index directory
    """
    version_sentinel = index_dir / ".schema_version"
    version_sentinel.write_text(str(SCHEMA_VERSION))

    # An unset SEARCH_LANGUAGE is written as an empty file.
    language_sentinel = index_dir / ".schema_language"
    language_sentinel.write_text(settings.SEARCH_LANGUAGE or "")


def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
    """
    Return a Tantivy index for ``index_dir``, recreating it when stale.

    Three outcomes are possible:

    * the directory does not exist - an index is constructed from the current
      schema without a path (presumably in-memory in tantivy-py; confirm
      against the library docs) and nothing is written to disk;
    * ``needs_rebuild`` reports a schema version or language mismatch - the
      directory is wiped, a fresh *empty* index is created in it and the
      sentinel files are written;
    * otherwise the existing on-disk index is opened as-is.

    Args:
        index_dir: Path to index directory (defaults to settings.INDEX_DIR)

    Returns:
        Opened Tantivy index (caller must register custom tokenizers)
    """
    target = settings.INDEX_DIR if index_dir is None else index_dir

    if not target.exists():
        return tantivy.Index(build_schema())

    if not needs_rebuild(target):
        return tantivy.Index.open(str(target))

    # Stale or incomplete index: start over from an empty directory.
    wipe_index(target)
    rebuilt = tantivy.Index(build_schema(), path=str(target))
    # Sentinels are written only after the index was created successfully.
    _write_sentinels(target)
    return rebuilt