diff --git a/src/paperless/parsers/text.py b/src/paperless/parsers/text.py index a6c149a0a..99d9dab08 100644 --- a/src/paperless/parsers/text.py +++ b/src/paperless/parsers/text.py @@ -1,22 +1,239 @@ +""" +Built-in plain-text document parser. + +Handles text/plain, text/csv, and application/csv MIME types by reading the +file content directly. Thumbnails are generated by rendering a page-sized +WebP image from the first 100,000 characters using Pillow. +""" + +from __future__ import annotations + +import logging +import shutil +import tempfile from pathlib import Path +from typing import TYPE_CHECKING +from typing import Self from django.conf import settings from PIL import Image from PIL import ImageDraw from PIL import ImageFont -from documents.parsers import DocumentParser +from paperless.version import __full_version_str__ + +if TYPE_CHECKING: + import datetime + from types import TracebackType + + from paperless.parsers import MetadataEntry + +logger = logging.getLogger("paperless.parsing.text") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "text/plain": ".txt", + "text/csv": ".csv", + "application/csv": ".csv", +} -class TextDocumentParser(DocumentParser): - """ - This parser directly parses a text document (.txt, .md, or .csv) +class TextDocumentParser: + """Parse plain-text documents (txt, csv) for Paperless-ngx. + + This parser reads the file content directly as UTF-8 text and renders a + simple thumbnail using Pillow. It does not perform OCR and does not + produce a searchable PDF archive copy. + + Class attributes + ---------------- + name : str + Human-readable parser name. + version : str + Semantic version string, kept in sync with Paperless-ngx releases. + author : str + Maintainer name. + url : str + Issue tracker / source URL. """ - logging_name = "paperless.parsing.text" + name: str = "Paperless-ngx Text Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" - def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path: - # Avoid reading entire file into memory + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ + + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + """Return the MIME types this parser handles. + + Returns + ------- + dict[str, str] + Mapping of MIME type to preferred file extension. + """ + return _SUPPORTED_MIME_TYPES + + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + """Return the priority score for handling this file. + + Parameters + ---------- + mime_type: + Detected MIME type of the file. + filename: + Original filename including extension. + path: + Optional filesystem path. Not inspected by this parser. + + Returns + ------- + int | None + 10 if the MIME type is supported, otherwise None. + """ + if mime_type in _SUPPORTED_MIME_TYPES: + return 10 + return None + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def can_produce_archive(self) -> bool: + """Whether this parser can produce a searchable PDF archive copy. + + Returns + ------- + bool + Always False — the text parser does not produce a PDF archive. + """ + return False + + @property + def requires_pdf_rendition(self) -> bool: + """Whether the parser must produce a PDF for the frontend to display. + + Returns + ------- + bool + Always False — plain text files are displayable as-is. + """ + return False + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self._tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), + ) + self._text: str | None = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + logger.debug("Cleaning up temporary directory %s", self._tempdir) + shutil.rmtree(self._tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: + """Read the document and store its text content. + + Parameters + ---------- + document_path: + Absolute path to the text file. + mime_type: + Detected MIME type of the document. + produce_archive: + Ignored — this parser never produces a PDF archive. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be read. + """ + self._text = self._read_text(document_path) + + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ + + def get_text(self) -> str | None: + """Return the plain-text content extracted during parse. + + Returns + ------- + str | None + Extracted text, or None if parse has not been called yet. + """ + return self._text + + def get_date(self) -> datetime.datetime | None: + """Return the document date detected during parse. + + Returns + ------- + datetime.datetime | None + Always None — the text parser does not detect dates. + """ + return None + + def get_archive_path(self) -> Path | None: + """Return the path to a generated archive PDF, or None. + + Returns + ------- + Path | None + Always None — the text parser does not produce a PDF archive. + """ + return None + + # ------------------------------------------------------------------ + # Thumbnail and metadata + # ------------------------------------------------------------------ + + def get_thumbnail(self, document_path: Path, mime_type: str) -> Path: + """Render the first portion of the document as a WebP thumbnail. + + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + + Returns + ------- + Path + Path to the generated WebP thumbnail inside the temporary directory. + """ max_chars = 100_000 file_size_limit = 50 * 1024 * 1024 @@ -35,16 +252,69 @@ class TextDocumentParser(DocumentParser): ) draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4) - out_path = self.tempdir / "thumb.webp" + out_path = self._tempdir / "thumb.webp" img.save(out_path, format="WEBP") return out_path - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = self.read_file_handle_unicode_errors(document_path) + def get_page_count( + self, + document_path: Path, + mime_type: str, + ) -> int | None: + """Return the number of pages in the document. - def get_settings(self) -> None: - """ - This parser does not implement additional settings yet + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + + Returns + ------- + int | None + Always None — page count is not meaningful for plain text. """ return None + + def extract_metadata( + self, + document_path: Path, + mime_type: str, + ) -> list[MetadataEntry]: + """Extract format-specific metadata from the document. + + Returns + ------- + list[MetadataEntry] + Always ``[]`` — plain text files carry no structured metadata. + """ + return [] + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _read_text(self, filepath: Path) -> str: + """Read file content, replacing invalid UTF-8 bytes rather than failing. + + Parameters + ---------- + filepath: + Path to the file to read. + + Returns + ------- + str + File content as a string. + """ + try: + return filepath.read_text(encoding="utf-8") + except UnicodeDecodeError as exc: + logger.warning( + "Unicode error reading %s, replacing bad bytes: %s", + filepath, + exc, + ) + return filepath.read_bytes().decode("utf-8", errors="replace")