mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-05 22:25:25 +00:00
187 lines
5.8 KiB
Python
187 lines
5.8 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import shutil
|
|
from typing import TYPE_CHECKING
|
|
from typing import Final
|
|
from typing import cast
|
|
|
|
import tantivy
|
|
from django.conf import settings
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
# Module-level logger for the paperless search subsystem.
logger = logging.getLogger("paperless.search")


# v1 - Initial tantivy schema format
# Bump whenever build_schema() changes incompatibly; needs_rebuild() compares
# this against the version stored in .index_settings.json.
SCHEMA_VERSION: Final[int] = 1
|
|
|
|
|
|
def build_schema() -> tantivy.Schema:
    """
    Construct the Tantivy schema for the paperless document index.

    The schema covers full-text search, filtering, sorting, and autocomplete:
    stored text fields for document content and metadata, fast numeric fields
    for permission/relation filters, shadow fields for sorting, and JSON
    fields for structured note/custom-field queries.

    Returns:
        Configured Tantivy schema ready for index creation
    """
    builder = tantivy.SchemaBuilder()

    # Primary key and content checksum (raw tokenizer: exact-match only).
    builder.add_unsigned_field("id", stored=True, indexed=True, fast=True)
    builder.add_text_field("checksum", stored=True, tokenizer_name="raw")

    # Main searchable, stored text fields.
    for name in (
        "title",
        "correspondent",
        "document_type",
        "storage_path",
        "original_filename",
        "content",
    ):
        builder.add_text_field(name, stored=True, tokenizer_name="paperless_text")

    # Shadow sort fields - fast, not stored/indexed
    for name in ("title_sort", "correspondent_sort", "type_sort"):
        builder.add_text_field(
            name,
            stored=False,
            tokenizer_name="simple_analyzer",
            fast=True,
        )

    # CJK support - not stored, indexed only
    builder.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")

    # Simple substring search support for title/content - not stored, indexed only
    for name in ("simple_title", "simple_content"):
        builder.add_text_field(
            name,
            stored=False,
            tokenizer_name="simple_search_analyzer",
        )

    # Autocomplete prefix scan - stored, not indexed
    builder.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

    builder.add_text_field("tag", stored=True, tokenizer_name="paperless_text")

    # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
    builder.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
    # Plain-text companion for notes — tantivy's SnippetGenerator does not support
    # JSON fields, so highlights require a text field with the same content.
    builder.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text")
    builder.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")

    # Relation / permission id filters - fast, indexed, never stored.
    for name in (
        "correspondent_id",
        "document_type_id",
        "storage_path_id",
        "tag_id",
        "owner_id",
        "viewer_id",
    ):
        builder.add_unsigned_field(name, stored=False, indexed=True, fast=True)

    # Timestamps for range queries and sorting.
    for name in ("created", "modified", "added"):
        builder.add_date_field(name, stored=True, indexed=True, fast=True)

    # Small numeric document attributes.
    for name in ("asn", "page_count", "num_notes"):
        builder.add_unsigned_field(name, stored=True, indexed=True, fast=True)

    return builder.build()
|
|
|
|
|
|
def needs_rebuild(index_dir: Path) -> bool:
    """
    Check if the search index needs rebuilding.

    Reads .index_settings.json to compare the stored schema version and
    search language against the current configuration. Returns True if the
    file is missing, unreadable, unparsable, or either value mismatches.

    Args:
        index_dir: Path to the search index directory

    Returns:
        True if the index needs rebuilding, False if it's up to date
    """
    settings_file = index_dir / ".index_settings.json"
    if not settings_file.exists():
        return True
    try:
        # Keep the try body minimal: only the read and parse can raise.
        data = json.loads(settings_file.read_text())
    except (ValueError, OSError):
        # OSError: file vanished after exists() or is unreadable;
        # ValueError covers json.JSONDecodeError. Either way, rebuild.
        return True
    if data.get("schema_version") != SCHEMA_VERSION:
        logger.info("Search index schema version mismatch - rebuilding.")
        return True
    if "language" not in data or data["language"] != settings.SEARCH_LANGUAGE:
        logger.info("Search index language changed - rebuilding.")
        return True
    return False
|
|
|
|
|
|
def wipe_index(index_dir: Path) -> None:
    """
    Delete all contents of the index directory to prepare for rebuild.

    Removes every file and (recursively) every subdirectory inside the
    index directory; the directory itself is kept.

    Args:
        index_dir: Path to the search index directory to clear
    """
    for entry in index_dir.iterdir():
        # Directories need a recursive delete; everything else unlinks directly.
        if entry.is_dir():
            shutil.rmtree(entry)
            continue
        entry.unlink()
|
|
|
|
|
|
def _write_sentinels(index_dir: Path) -> None:
    """Write .index_settings.json so the next index open can skip rebuilding."""
    # Record the current schema version and search language;
    # needs_rebuild() compares these on the next open.
    payload = {
        "schema_version": SCHEMA_VERSION,
        "language": settings.SEARCH_LANGUAGE,
    }
    (index_dir / ".index_settings.json").write_text(json.dumps(payload))
|
|
|
|
|
|
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
    """
    Open the Tantivy index, creating or rebuilding as needed.

    If the index directory is missing, an in-memory index is returned.
    If the on-disk index is stale (schema version or language changed),
    the directory is wiped and a fresh index is created; otherwise the
    existing index is opened in place.

    Args:
        index_dir: Path to index directory (defaults to settings.INDEX_DIR)

    Returns:
        Opened Tantivy index (caller must register custom tokenizers)
    """
    if index_dir is None:
        index_dir = cast("Path", settings.INDEX_DIR)

    # No directory at all: hand back a RAM-only index.
    if not index_dir.exists():
        return tantivy.Index(build_schema())

    # Up to date: open the existing on-disk index directly.
    if not needs_rebuild(index_dir):
        return tantivy.Index.open(str(index_dir))

    # Stale: clear the directory, create a fresh index, and record the
    # sentinel file so the next open can skip this rebuild.
    wipe_index(index_dir)
    index = tantivy.Index(build_schema(), path=str(index_dir))
    _write_sentinels(index_dir)
    return index
|