# paperless-ngx/src/paperless/parsers/text.py

"""
Built-in plain-text document parser.

Handles text/plain, text/csv, and application/csv MIME types by reading the
file content directly.  Thumbnails are generated by rendering a page-sized
WebP image from the first 100,000 characters using Pillow.
"""

from __future__ import annotations

import logging
import shutil
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self

from django.conf import settings
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont

from paperless.parsers.utils import read_file_handle_unicode_errors
from paperless.version import __full_version_str__

if TYPE_CHECKING:
    import datetime
    from types import TracebackType

    from paperless.parsers import MetadataEntry
    from paperless.parsers import ParserContext

# Module-wide logger used by the text parser.
logger = logging.getLogger("paperless.parsing.text")

# MIME type -> preferred file extension for every type this parser accepts.
# Both CSV MIME spellings share the same extension.
_SUPPORTED_MIME_TYPES: dict[str, str] = {
    "text/plain": ".txt",
    **dict.fromkeys(("text/csv", "application/csv"), ".csv"),
}


class TextDocumentParser:
    """Parse plain-text documents (txt, csv) for Paperless-ngx.

    The parser stores the text returned by
    ``read_file_handle_unicode_errors`` and renders a simple thumbnail using
    Pillow.  It does not perform OCR and does not produce a searchable PDF
    archive copy.

    Each instance creates its own scratch directory on construction and
    removes it in ``__exit__``; use the parser as a context manager so the
    directory is cleaned up.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Version string, taken from the Paperless-ngx full version string.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Text Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser handles.

        A fresh copy is returned on every call so that a caller mutating the
        result cannot alter the module-level table consulted by ``score``.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension.
        """
        return dict(_SUPPORTED_MIME_TYPES)

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension. Not inspected by this
            parser.
        path:
            Optional filesystem path. Not inspected by this parser.

        Returns
        -------
        int | None
            10 if the MIME type is supported, otherwise None.
        """
        if mime_type in _SUPPORTED_MIME_TYPES:
            return 10
        return None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always False — the text parser does not produce a PDF archive.
        """
        return False

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always False — plain text files are displayable as-is.
        """
        return False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        """Create the per-instance scratch directory.

        Parameters
        ----------
        logging_group:
            Accepted but not used by this parser.
        """
        # The scratch root may not exist yet (e.g. on first run).
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        # Populated by parse(); None until then.
        self._text: str | None = None

    def __enter__(self) -> Self:
        """Return the parser itself for use in a ``with`` statement."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Remove the scratch directory; exceptions are not suppressed."""
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        # Best-effort cleanup: removal errors are deliberately ignored.
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def configure(self, context: ParserContext) -> None:
        """Accept a parser context; this parser has nothing to configure."""
        pass

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Read the document and store its text content.

        Parameters
        ----------
        document_path:
            Absolute path to the text file.
        mime_type:
            Detected MIME type of the document. Not inspected here.
        produce_archive:
            Ignored — this parser never produces a PDF archive.

        Raises
        ------
        documents.parsers.ParseError
            If the file cannot be read.  NOTE(review): any error originates
            in ``read_file_handle_unicode_errors``; confirm the exact
            exception type against that helper.
        """
        self._text = read_file_handle_unicode_errors(document_path, log=logger)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str:
        """Return the plain-text content extracted during parse.

        Returns
        -------
        str
            Extracted text, or an empty string if ``parse`` has not stored
            any text.
        """
        return self._text or ""

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Always None — the text parser does not detect dates.
        """
        return None

    def get_archive_path(self) -> Path | None:
        """Return the path to a generated archive PDF, or None.

        Returns
        -------
        Path | None
            Always None — the text parser does not produce a PDF archive.
        """
        return None

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Render the first portion of the document as a WebP thumbnail.

        At most 100,000 characters are read from the file (decoded as UTF-8
        with undecodable bytes replaced).  Files larger than 50 MiB are not
        read at all; a placeholder message is rendered instead.  The text is
        drawn onto a white 500x700 canvas without any wrapping.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document. Not inspected here.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temporary directory.

        Raises
        ------
        OSError
            If the document cannot be stat'ed or opened, or if the configured
            thumbnail font cannot be loaded.
        """
        max_chars = 100_000
        file_size_limit = 50 * 1024 * 1024  # 50 MiB

        # Normalise once so the size check and the read go through the same
        # Path object (previously only the open() call tolerated a str path).
        source = Path(document_path)

        if source.stat().st_size > file_size_limit:
            text = "[File too large to preview]"
        else:
            # Bounded read: never pull more than max_chars into memory.
            with source.open("r", encoding="utf-8", errors="replace") as f:
                text = f.read(max_chars)

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.Layout.BASIC,
        )
        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)

        # Written into the per-instance scratch directory, which __exit__
        # removes; callers must consume the file before leaving the context.
        out_path = self._tempdir / "thumb.webp"
        img.save(out_path, format="WEBP")

        return out_path

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Always None — page count is not meaningful for plain text.
        """
        return None

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document. Not inspected here.
        mime_type:
            Detected MIME type of the document. Not inspected here.

        Returns
        -------
        list[MetadataEntry]
            Always ``[]`` — plain text files carry no structured metadata.
        """
        return []