mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-30 17:24:22 +00:00
299 lines
8.4 KiB
Python
299 lines
8.4 KiB
Python
"""
|
|
Built-in plain-text document parser.
|
|
|
|
Handles text/plain, text/csv, and application/csv MIME types by reading the
|
|
file content directly. Thumbnails are generated by rendering a page-sized
|
|
WebP image from the first 100,000 characters using Pillow.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import shutil
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING
|
|
from typing import Self
|
|
|
|
from django.conf import settings
|
|
from PIL import Image
|
|
from PIL import ImageDraw
|
|
from PIL import ImageFont
|
|
|
|
from paperless.parsers.utils import read_file_handle_unicode_errors
|
|
from paperless.version import __full_version_str__
|
|
|
|
if TYPE_CHECKING:
|
|
import datetime
|
|
from types import TracebackType
|
|
|
|
from paperless.parsers import MetadataEntry
|
|
from paperless.parsers import ParserContext
|
|
|
|
# Module-level logger; the explicit name groups this parser's records under
# the "paperless.parsing" logger hierarchy.
logger = logging.getLogger("paperless.parsing.text")

# MIME type -> preferred file extension for every type this parser accepts.
# Both CSV spellings map to the same ".csv" extension.
_SUPPORTED_MIME_TYPES: dict[str, str] = {
    "text/plain": ".txt",
    "text/csv": ".csv",
    "application/csv": ".csv",
}
|
|
|
|
|
|
class TextDocumentParser:
    """Parse plain-text documents (txt, csv) for Paperless-ngx.

    The parser reads the file content directly as text (through
    ``read_file_handle_unicode_errors``) and renders a simple thumbnail using
    Pillow.  It does not perform OCR and does not produce a searchable PDF
    archive copy.

    Instances are context managers: a scratch directory is created on
    construction and removed again when the ``with`` block exits.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Version string, taken from the Paperless-ngx version module.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Text Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser handles.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension.  A fresh copy
            is returned on every call.
        """
        # Hand out a copy: ``score`` consults the module-level table, so a
        # caller mutating the returned dict must not be able to change which
        # MIME types this parser claims.
        return dict(_SUPPORTED_MIME_TYPES)

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension.  Not inspected by this
            parser.
        path:
            Optional filesystem path.  Not inspected by this parser.

        Returns
        -------
        int | None
            10 if the MIME type is supported, otherwise None.
        """
        return 10 if mime_type in _SUPPORTED_MIME_TYPES else None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always False — the text parser does not produce a PDF archive.
        """
        return False

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always False — plain text files are displayable as-is.
        """
        return False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        """Create the per-instance scratch directory.

        Parameters
        ----------
        logging_group:
            Accepted but not used by this parser.
        """
        # The scratch root may not exist yet; create it before asking
        # ``mkdtemp`` for a unique sub-directory inside it.
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        # Populated by ``parse``; stays None until then.
        self._text: str | None = None

    def __enter__(self) -> Self:
        """Enter the context manager and return the parser itself."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Remove the scratch directory created by ``__init__``.

        Returns None, so an exception raised inside the ``with`` block is
        not suppressed.
        """
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        # Best-effort cleanup: a failure to delete must not mask the
        # outcome of the ``with`` block.
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def configure(self, context: ParserContext) -> None:
        """Accept a parser context; this parser does nothing with it."""

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Read the document and store its text content.

        Parameters
        ----------
        document_path:
            Absolute path to the text file.
        mime_type:
            Detected MIME type of the document.  Not inspected.
        produce_archive:
            Ignored — this parser never produces a PDF archive.

        Raises
        ------
        documents.parsers.ParseError
            If the file cannot be read.
        """
        self._text = read_file_handle_unicode_errors(document_path, log=logger)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str:
        """Return the plain-text content extracted during parse.

        Returns
        -------
        str
            Extracted text, or an empty string if ``parse`` has not stored
            any text.
        """
        return self._text or ""

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Always None — the text parser does not detect dates.
        """
        return None

    def get_archive_path(self) -> Path | None:
        """Return the path to a generated archive PDF, or None.

        Returns
        -------
        Path | None
            Always None — the text parser does not produce a PDF archive.
        """
        return None

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Render the first portion of the document as a WebP thumbnail.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.  Not inspected.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temporary
            directory.

        Raises
        ------
        OSError
            If the source document cannot be stat'ed or opened.
        """
        max_chars = 100_000
        file_size_limit = 50 * 1024 * 1024  # 50 MiB

        if document_path.stat().st_size > file_size_limit:
            # Oversized files are not opened at all; a placeholder is drawn.
            text = "[File too large to preview]"
        else:
            # errors="replace" keeps undecodable bytes from aborting the
            # preview; only the first ``max_chars`` characters are read.
            with document_path.open("r", encoding="utf-8", errors="replace") as f:
                text = f.read(max_chars)

        # Fixed 500x700 white canvas with the text drawn from the top-left.
        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.Layout.BASIC,
        )
        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)

        out_path = self._tempdir / "thumb.webp"
        img.save(out_path, format="WEBP")

        return out_path

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Always None — page count is not meaningful for plain text.
        """
        return None

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.  Not inspected.
        mime_type:
            Detected MIME type of the document.  Not inspected.

        Returns
        -------
        list[MetadataEntry]
            Always ``[]`` — plain text files carry no structured metadata.
        """
        return []
|