Feat: refactor TextDocumentParser to ParserProtocol

Starting from the moved paperless_text/parsers.py, rewrite the class to
satisfy ParserProtocol without inheriting from the old DocumentParser base:

- Add class-level identity attributes (name, version, author, url)
- Add supported_mime_types() and score() classmethods
- Add can_produce_archive and requires_pdf_rendition properties (both False)
- Replace tempdir / read_file_handle_unicode_errors from old base class with
  a self-contained __init__, __enter__, __exit__, and _read_text helper
- Drop file_name parameter from parse() and get_thumbnail(); add produce_archive kwarg
- Add extract_metadata() returning [] (plain text has no structured metadata)
- Remove get_settings() (not part of ParserProtocol)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-09 16:54:52 -07:00
parent 8c08362ebc
commit 7eb417e796

View File

@@ -1,22 +1,239 @@
"""
Built-in plain-text document parser.
Handles text/plain, text/csv, and application/csv MIME types by reading the
file content directly. Thumbnails are generated by rendering a page-sized
WebP image from the first 100,000 characters using Pillow.
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
from django.conf import settings
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from documents.parsers import DocumentParser
from paperless.version import __full_version_str__
if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.text")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"text/plain": ".txt",
"text/csv": ".csv",
"application/csv": ".csv",
}
class TextDocumentParser(DocumentParser):
"""
This parser directly parses a text document (.txt, .md, or .csv)
class TextDocumentParser:
"""Parse plain-text documents (txt, csv) for Paperless-ngx.
This parser reads the file content directly as UTF-8 text and renders a
simple thumbnail using Pillow. It does not perform OCR and does not
produce a searchable PDF archive copy.
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
"""
logging_name = "paperless.parsing.text"
name: str = "Paperless-ngx Text Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
# Avoid reading entire file into memory
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
    """Return the MIME types this parser handles.

    Returns
    -------
    dict[str, str]
        Mapping of MIME type to preferred file extension. A new dict is
        built on every call so that a caller mutating the result cannot
        alter the module-level table that ``score()`` consults.
    """
    # Hand out a copy: the module-level mapping is shared by every call
    # and must not be changed through a returned reference.
    return dict(_SUPPORTED_MIME_TYPES)
@classmethod
def score(
    cls,
    mime_type: str,
    filename: str,
    path: Path | None = None,
) -> int | None:
    """Rate how well this parser fits the given file.

    Only the MIME type takes part in the decision; ``filename`` and
    ``path`` are accepted but never read here.

    Parameters
    ----------
    mime_type:
        Detected MIME type of the file.
    filename:
        Original filename including extension. Unused.
    path:
        Optional filesystem path. Unused.

    Returns
    -------
    int | None
        ``10`` when ``mime_type`` is one of the supported MIME types,
        ``None`` otherwise.
    """
    is_supported = mime_type in _SUPPORTED_MIME_TYPES
    return 10 if is_supported else None
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
    """Report whether a searchable PDF archive copy can be generated.

    Returns
    -------
    bool
        ``False`` unconditionally; this parser never writes a PDF archive.
    """
    return False
@property
def requires_pdf_rendition(self) -> bool:
    """Report whether a PDF must be produced for the frontend to display.

    Returns
    -------
    bool
        ``False`` unconditionally; plain text can be shown as-is.
    """
    return False
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
    """Set up the parser and create its private working directory.

    The scratch root from the Django settings is created if it does not
    exist yet, and a fresh ``paperless-``-prefixed directory is made
    inside it for this instance.

    Parameters
    ----------
    logging_group:
        Accepted but not used by this parser.
    """
    scratch_root = settings.SCRATCH_DIR
    scratch_root.mkdir(parents=True, exist_ok=True)
    workdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)
    self._tempdir = Path(workdir)
    # Filled in by parse(); None until then.
    self._text: str | None = None
def __enter__(self) -> Self:
return self
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: TracebackType | None,
) -> None:
    """Delete the instance's temporary directory on leaving the context.

    Removal is best-effort: errors from ``shutil.rmtree`` are ignored.
    The method returns ``None``, so an exception raised inside the
    ``with`` body is not suppressed.

    Parameters
    ----------
    exc_type:
        Exception class raised in the ``with`` body, if any. Unused.
    exc_val:
        Exception instance raised in the ``with`` body, if any. Unused.
    exc_tb:
        Traceback of that exception, if any. Unused.
    """
    scratch = self._tempdir
    logger.debug("Cleaning up temporary directory %s", scratch)
    shutil.rmtree(scratch, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
    self,
    document_path: Path,
    mime_type: str,
    *,
    produce_archive: bool = True,
) -> None:
    """Read the document and store its text content.

    The text is kept on the instance and can be retrieved afterwards
    with ``get_text()``.

    Parameters
    ----------
    document_path:
        Absolute path to the text file.
    mime_type:
        Detected MIME type of the document. Not inspected here.
    produce_archive:
        Ignored — this parser never produces a PDF archive.

    Raises
    ------
    OSError
        If the file cannot be opened or read. Only invalid UTF-8 is
        handled by ``_read_text``; other I/O errors propagate unchanged.
    """
    self._text = self._read_text(document_path)
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
"""
return self._text
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
Returns
-------
datetime.datetime | None
Always None — the text parser does not detect dates.
"""
return None
def get_archive_path(self) -> Path | None:
"""Return the path to a generated archive PDF, or None.
Returns
-------
Path | None
Always None — the text parser does not produce a PDF archive.
"""
return None
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
"""Render the first portion of the document as a WebP thumbnail.
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
Returns
-------
Path
Path to the generated WebP thumbnail inside the temporary directory.
"""
max_chars = 100_000
file_size_limit = 50 * 1024 * 1024
@@ -35,16 +252,69 @@ class TextDocumentParser(DocumentParser):
)
draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
out_path = self.tempdir / "thumb.webp"
out_path = self._tempdir / "thumb.webp"
img.save(out_path, format="WEBP")
return out_path
def parse(self, document_path, mime_type, file_name=None) -> None:
self.text = self.read_file_handle_unicode_errors(document_path)
def get_page_count(
self,
document_path: Path,
mime_type: str,
) -> int | None:
"""Return the number of pages in the document.
def get_settings(self) -> None:
"""
This parser does not implement additional settings yet
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
Returns
-------
int | None
Always None — page count is not meaningful for plain text.
"""
return None
def extract_metadata(
    self,
    document_path: Path,
    mime_type: str,
) -> list[MetadataEntry]:
    """Return format-specific metadata for the document.

    Parameters
    ----------
    document_path:
        Path to the source document. Not read by this parser.
    mime_type:
        Detected MIME type of the document. Not inspected here.

    Returns
    -------
    list[MetadataEntry]
        Always an empty list — plain text files carry no structured
        metadata.
    """
    no_entries: list[MetadataEntry] = []
    return no_entries
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _read_text(self, filepath: Path) -> str:
    """Read file content, replacing invalid UTF-8 bytes rather than failing.

    A strict UTF-8 read is attempted first. If that fails with a decode
    error, a warning is logged and the file is read again with
    undecodable bytes replaced by U+FFFD.

    Parameters
    ----------
    filepath:
        Path to the file to read.

    Returns
    -------
    str
        File content as a string.

    Raises
    ------
    OSError
        If the file cannot be opened or read; only ``UnicodeDecodeError``
        is handled here.
    """
    try:
        return filepath.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        logger.warning(
            "Unicode error reading %s, replacing bad bytes: %s",
            filepath,
            exc,
        )
        # Re-read in text mode rather than decoding raw bytes, so the
        # fallback applies the same newline translation as the strict
        # read above and line endings do not depend on which branch ran.
        return filepath.read_text(encoding="utf-8", errors="replace")