From c2b8b22fb46975c4535966682e2c3b39c2ab4845 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:22:18 -0700 Subject: [PATCH] Chore: Convert mail parser to plugin style (#12397) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor(mail): rename paperless_mail/parsers.py → paperless/parsers/mail.py Preserve git history for MailDocumentParser by committing the rename separately before editing, following the project convention. Co-Authored-By: Claude Sonnet 4.6 * Refactor(mail): move mail parser tests to paperless/tests/parsers/ Move test_parsers.py → test_mail_parser.py and test_parsers_live.py → test_mail_parser_live.py alongside the other built-in parser tests, preserving git history before editing. Update MailDocumentParser import to the new canonical location. Co-Authored-By: Claude Sonnet 4.6 * Chore: move mail parser sample files to paperless/tests/samples/mail/ Relocate all mail test fixtures from src/paperless_mail/tests/samples/ to src/paperless/tests/samples/mail/ ahead of the parser plugin refactor. Add the new path to the codespell skip list to prevent false-positive spell corrections in binary/fixture email files. Co-Authored-By: Claude Sonnet 4.6 * Feat(tests): add mail parser fixtures to paperless/tests/parsers/conftest.py Add mail_samples_dir, per-file sample fixtures, and mail_parser (context-manager style) to mirror the old paperless_mail conftest but rooted at the new samples/mail/ location. 
Co-Authored-By: Claude Sonnet 4.6 * Feat(parsers): migrate MailDocumentParser to ParserProtocol Move the mail parser from paperless_mail/parsers.py to paperless/parsers/mail.py and refactor it to implement ParserProtocol: - Class-level name/version/author/url attributes - supported_mime_types() and score() classmethods (score=20) - can_produce_archive=False, requires_pdf_rendition=True - Context manager lifecycle (__enter__/__exit__) - New parse() signature without mailrule_id kwarg; consumer sets parser.mailrule_id before calling parse() instead - get_text()/get_date()/get_archive_path() accessor methods - extract_metadata() returning email headers and attachment info Register MailDocumentParser in the ParserRegistry alongside Text and Tika parsers. Update consumer, signals, and all import sites to use the new location. Update tests to use the new accessor API, patch paths, and context-manager fixture. Co-Authored-By: Claude Sonnet 4.6 * Fix(parsers): pop legacy constructor args in mail signal wrapper MailDocumentParser.__init__ takes no constructor args in the new protocol. Update the get_parser() signal wrapper to pop logging_group and progress_callback (passed by the legacy consumer dispatch path) before instantiating — the same pattern used by TextDocumentParser. Also update test_mail_parser_receives_mailrule to use the real signal wrapper (mail_get_parser) instead of MailDocumentParser directly, so the test exercises the actual dispatch path and matches the new parse() call signature (no mailrule kwarg). 
Co-Authored-By: Claude Sonnet 4.6 * Bumps this so we can run * Fixes location of the fixture * Removes fixtures which were duplicated * Feat(parsers): add ParserContext and configure() to ParserProtocol Replace the ad-hoc mailrule_id attribute assignment with a typed, immutable ParserContext dataclass and a configure() method on the Protocol: - ParserContext(frozen=True, slots=True) lives in paperless/parsers/ alongside ParserProtocol and MetadataEntry; currently carries only mailrule_id but is designed to grow with output_type, ocr_mode, and ocr_language in a future phase (decoupling parsers from settings.*) - ParserProtocol.configure(context: ParserContext) -> None is the extension point; no-op by default - MailDocumentParser.configure() reads mailrule_id into _mailrule_id - TextDocumentParser and TikaDocumentParser implement a no-op configure() - Consumer calls document_parser.configure(ParserContext(...)) before parse(), replacing the isinstance(parser, MailDocumentParser) guard and the direct attribute mutation Co-Authored-By: Claude Sonnet 4.6 * Feat(parsers): call configure(ParserContext()) in update_document task Apply the same new-style parser shim pattern as the consumer to update_document_content_maybe_archive_file: - Call __enter__ for Text/Tika parsers after instantiation - Call configure(ParserContext()) before parse() for all new-style parsers (mailrule_id is not available here — this is a re-process of an existing document, so the default empty context is correct) - Call parse(path, mime_type) with 2 args for new-style parsers - Call get_thumbnail(path, mime_type) with 2 args for new-style parsers - Call __exit__ instead of cleanup() in the finally block Co-Authored-By: Claude Sonnet 4.6 * Fix(tests): add configure() to DummyParser and missing-method parametrize ParserProtocol now requires configure(context: ParserContext) -> None. 
Update DummyParser in test_registry.py to implement it, and add 'missing-configure' to the test_partial_compliant_fails_isinstance parametrize list so the new method is covered by the negative test. Co-Authored-By: Claude Sonnet 4.6 * Cleans up the reprocess task and generally reduces duplicate of classes * Corrects the score return * Updates so we can report a page count for these parsers, assuming we do have an archive produced when called * Increases test coverage * One more coverage * Updates typing * Updates typing --------- Co-authored-by: Claude Sonnet 4.6 --- pyproject.toml | 2 +- src/documents/consumer.py | 50 +- src/documents/tasks.py | 60 +- src/documents/tests/test_consumer.py | 20 +- src/paperless/parsers/__init__.py | 55 ++ src/paperless/parsers/mail.py | 834 ++++++++++++++++++ src/paperless/parsers/registry.py | 2 + src/paperless/parsers/remote.py | 4 + src/paperless/parsers/text.py | 4 + src/paperless/parsers/tika.py | 14 +- src/paperless/tests/parsers/conftest.py | 164 ++++ .../tests/parsers/test_mail_parser.py} | 88 +- .../tests/parsers/test_mail_parser_live.py} | 8 +- .../tests/parsers/test_remote_parser.py | 2 + .../tests/parsers/test_text_parser.py | 5 + .../tests/parsers/test_tika_parser.py | 25 + .../tests/samples/mail}/broken.eml | 0 .../tests/samples/mail}/first.pdf | Bin .../tests/samples/mail}/html.eml | 0 .../tests/samples/mail}/html.eml.html | 0 .../tests/samples/mail}/html.eml.pdf | Bin .../tests/samples/mail}/html.eml.pdf.webp | Bin .../tests/samples/mail}/sample.html | 0 .../tests/samples/mail}/sample.html.pdf | Bin .../tests/samples/mail}/sample.html.pdf.webp | Bin .../tests/samples/mail}/sample.png | Bin .../tests/samples/mail}/second.pdf | Bin .../tests/samples/mail}/simple_text.eml | 0 .../tests/samples/mail}/simple_text.eml.pdf | Bin .../samples/mail}/simple_text.eml.pdf.webp | Bin src/paperless/tests/test_registry.py | 7 + src/paperless_mail/parsers.py | 481 ---------- src/paperless_mail/signals.py | 9 +- 
src/paperless_mail/tests/conftest.py | 70 -- 34 files changed, 1285 insertions(+), 619 deletions(-) create mode 100644 src/paperless/parsers/mail.py rename src/{paperless_mail/tests/test_parsers.py => paperless/tests/parsers/test_mail_parser.py} (89%) rename src/{paperless_mail/tests/test_parsers_live.py => paperless/tests/parsers/test_mail_parser_live.py} (97%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/broken.eml (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/first.pdf (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/html.eml (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/html.eml.html (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/html.eml.pdf (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/html.eml.pdf.webp (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/sample.html (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/sample.html.pdf (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/sample.html.pdf.webp (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/sample.png (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/second.pdf (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/simple_text.eml (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/simple_text.eml.pdf (100%) rename src/{paperless_mail/tests/samples => paperless/tests/samples/mail}/simple_text.eml.pdf.webp (100%) delete mode 100644 src/paperless_mail/parsers.py diff --git a/pyproject.toml b/pyproject.toml index 43ad1e1cc..d41a918c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -256,7 +256,7 @@ lint.isort.force-single-line = true [tool.codespell] write-changes = true ignore-words-list = 
"criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish" -skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json" +skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json" [tool.pytest] minversion = "9.0" diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 11abf46d4..ba2bba473 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -51,10 +51,11 @@ from documents.templating.workflows import parse_w_workflow_placeholders from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.parsers import ParserContext +from paperless.parsers.mail import MailDocumentParser from paperless.parsers.remote import RemoteDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser -from paperless_mail.parsers import MailDocumentParser LOGGING_NAME: Final[str] = "paperless.consumer" @@ -71,7 +72,12 @@ def _parser_cleanup(parser: DocumentParser) -> None: """ if isinstance( parser, - (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser), + ( + MailDocumentParser, + RemoteDocumentParser, + TextDocumentParser, + TikaDocumentParser, + ), ): parser.__exit__(None, None, None) else: @@ -453,13 +459,20 @@ class ConsumerPlugin( progress_callback=progress_callback, ) + parser_is_new_style = isinstance( + document_parser, + ( + MailDocumentParser, + RemoteDocumentParser, + TextDocumentParser, + TikaDocumentParser, + ), + ) + # New-style parsers use __enter__/__exit__ for resource management. # _parser_cleanup (below) handles __exit__; call __enter__ here. 
# TODO(stumpylog): Remove me in the future - if isinstance( - document_parser, - (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser), - ): + if parser_is_new_style: document_parser.__enter__() self.log.debug(f"Parser: {type(document_parser).__name__}") @@ -480,20 +493,12 @@ class ConsumerPlugin( ConsumerStatusShortMessage.PARSING_DOCUMENT, ) self.log.debug(f"Parsing {self.filename}...") - if ( - isinstance(document_parser, MailDocumentParser) - and self.input_doc.mailrule_id - ): - document_parser.parse( - self.working_copy, - mime_type, - self.filename, - self.input_doc.mailrule_id, + + # TODO(stumpylog): Remove me in the future when all parsers use new protocol + if parser_is_new_style: + document_parser.configure( + ParserContext(mailrule_id=self.input_doc.mailrule_id), ) - elif isinstance( - document_parser, - (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser), - ): # TODO(stumpylog): Remove me in the future document_parser.parse(self.working_copy, mime_type) else: @@ -506,11 +511,8 @@ class ConsumerPlugin( ProgressStatusOptions.WORKING, ConsumerStatusShortMessage.GENERATING_THUMBNAIL, ) - if isinstance( - document_parser, - (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser), - ): - # TODO(stumpylog): Remove me in the future + # TODO(stumpylog): Remove me in the future when all parsers use new protocol + if parser_is_new_style: thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) else: thumbnail = document_parser.get_thumbnail( diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 378695731..947da878f 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -65,6 +65,11 @@ from documents.signals.handlers import run_workflows from documents.signals.handlers import send_websocket_document_updated from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig +from paperless.parsers import ParserContext +from paperless.parsers.mail import 
MailDocumentParser +from paperless.parsers.remote import RemoteDocumentParser +from paperless.parsers.text import TextDocumentParser +from paperless.parsers.tika import TikaDocumentParser from paperless_ai.indexing import llm_index_add_or_update_document from paperless_ai.indexing import llm_index_remove_document from paperless_ai.indexing import update_llm_index @@ -304,7 +309,9 @@ def update_document_content_maybe_archive_file(document_id) -> None: mime_type = document.mime_type - parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type) + parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type( + mime_type, + ) if not parser_class: logger.error( @@ -315,14 +322,41 @@ def update_document_content_maybe_archive_file(document_id) -> None: parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) - try: - parser.parse(document.source_path, mime_type, document.get_public_filename()) + parser_is_new_style = isinstance( + parser, + ( + MailDocumentParser, + RemoteDocumentParser, + TextDocumentParser, + TikaDocumentParser, + ), + ) - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if parser_is_new_style: + parser.__enter__() + + try: + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if parser_is_new_style: + parser.configure(ParserContext()) + parser.parse(document.source_path, mime_type) + else: + parser.parse( + document.source_path, + mime_type, + document.get_public_filename(), + ) + + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if parser_is_new_style: + thumbnail = parser.get_thumbnail(document.source_path, mime_type) + else: + thumbnail = parser.get_thumbnail( + document.source_path, + mime_type, + document.get_public_filename(), + ) with transaction.atomic(): oldDocument = 
Document.objects.get(pk=document.pk) @@ -403,8 +437,14 @@ def update_document_content_maybe_archive_file(document_id) -> None: f"Error while parsing document {document} (ID: {document_id})", ) finally: - # TODO(stumpylog): Cleanup once all parsers are handled - parser.cleanup() + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + parser.__exit__(None, None, None) + else: + parser.cleanup() @shared_task diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a59c7d676..a3574fdce 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -36,7 +36,6 @@ from documents.tests.utils import DummyProgressManager from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import GetConsumerMixin from paperless_mail.models import MailRule -from paperless_mail.parsers import MailDocumentParser class _BaseTestParser(DocumentParser): @@ -1091,7 +1090,7 @@ class TestConsumer( self.assertEqual(command[1], "--replace-input") @mock.patch("paperless_mail.models.MailRule.objects.get") - @mock.patch("paperless_mail.parsers.MailDocumentParser.parse") + @mock.patch("paperless.parsers.mail.MailDocumentParser.parse") @mock.patch("documents.parsers.document_consumer_declaration.send") def test_mail_parser_receives_mailrule( self, @@ -1107,11 +1106,13 @@ class TestConsumer( THEN: - The mail parser should receive the mail rule """ + from paperless_mail.signals import get_parser as mail_get_parser + mock_consumer_declaration_send.return_value = [ ( None, { - "parser": MailDocumentParser, + "parser": mail_get_parser, "mime_types": {"message/rfc822": ".eml"}, "weight": 0, }, @@ -1123,9 +1124,10 @@ class TestConsumer( with self.get_consumer( filepath=( Path(__file__).parent.parent.parent - / Path("paperless_mail") + / Path("paperless") / Path("tests") / Path("samples") + 
/ Path("mail") ).resolve() / "html.eml", source=DocumentSource.MailFetch, @@ -1136,12 +1138,10 @@ class TestConsumer( ConsumerError, ): consumer.run() - mock_mail_parser_parse.assert_called_once_with( - consumer.working_copy, - "message/rfc822", - file_name="sample.pdf", - mailrule=mock_mailrule_get.return_value, - ) + mock_mail_parser_parse.assert_called_once_with( + consumer.working_copy, + "message/rfc822", + ) @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py index ea67ade00..c9c1530a5 100644 --- a/src/paperless/parsers/__init__.py +++ b/src/paperless/parsers/__init__.py @@ -35,6 +35,7 @@ Usage example (third-party parser):: from __future__ import annotations +from dataclasses import dataclass from typing import TYPE_CHECKING from typing import Protocol from typing import Self @@ -48,6 +49,7 @@ if TYPE_CHECKING: __all__ = [ "MetadataEntry", + "ParserContext", "ParserProtocol", ] @@ -73,6 +75,44 @@ class MetadataEntry(TypedDict): """String representation of the field value.""" +@dataclass(frozen=True, slots=True) +class ParserContext: + """Immutable context passed to a parser before parse(). + + The consumer assembles this from the ingestion event and Django + settings, then calls ``parser.configure(context)`` before + ``parser.parse()``. Parsers read only the fields relevant to them; + unneeded fields are ignored. + + ``frozen=True`` prevents accidental mutation after the consumer + hands the context off. ``slots=True`` keeps instances lightweight. + + Fields + ------ + mailrule_id : int | None + Primary key of the ``MailRule`` that triggered this ingestion, + or ``None`` when the document did not arrive via a mail rule. + Used by ``MailDocumentParser`` to select the PDF layout. + + Notes + ----- + Future fields (not yet implemented): + + * ``output_type`` — PDF/A variant for archive generation + (replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers). 
+ * ``ocr_mode`` — skip-text, redo, force, etc. + (replaces ``settings.OCR_MODE`` reads inside parsers). + * ``ocr_language`` — Tesseract language string. + (replaces ``settings.OCR_LANGUAGE`` reads inside parsers). + + When those fields are added the consumer will read from Django + settings once and populate them here, decoupling parsers from + ``settings.*`` entirely. + """ + + mailrule_id: int | None = None + + @runtime_checkable class ParserProtocol(Protocol): """Structural contract for all Paperless-ngx document parsers. @@ -191,6 +231,21 @@ class ParserProtocol(Protocol): # Core parsing interface # ------------------------------------------------------------------ + def configure(self, context: ParserContext) -> None: + """Apply source context before parse(). + + Called by the consumer after instantiation and before parse(). + The default implementation is a no-op; parsers override only the + fields they need. + + Parameters + ---------- + context: + Immutable context assembled by the consumer for this + specific ingestion event. + """ + ... + def parse( self, document_path: Path, diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py new file mode 100644 index 000000000..9914b2ec6 --- /dev/null +++ b/src/paperless/parsers/mail.py @@ -0,0 +1,834 @@ +""" +Built-in mail document parser. + +Handles message/rfc822 (EML) MIME type by: +- Parsing the email using imap_tools +- Generating a PDF via Gotenberg (for display and archive) +- Extracting text via Tika for HTML content +- Extracting metadata from email headers + +The parser always produces a PDF because EML files cannot be rendered +natively in a browser (requires_pdf_rendition=True). 
+""" + +from __future__ import annotations + +import logging +import re +import shutil +import tempfile +from html import escape +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Self + +from bleach import clean +from bleach import linkify +from django.conf import settings +from django.utils import timezone +from django.utils.timezone import is_naive +from django.utils.timezone import make_aware +from gotenberg_client import GotenbergClient +from gotenberg_client.constants import A4 +from gotenberg_client.options import Measurement +from gotenberg_client.options import MeasurementUnitType +from gotenberg_client.options import PageMarginsType +from gotenberg_client.options import PdfAFormat +from humanize import naturalsize +from imap_tools import MailAttachment +from imap_tools import MailMessage +from tika_client import TikaClient + +from documents.parsers import ParseError +from documents.parsers import make_thumbnail_from_pdf +from paperless.models import OutputTypeChoices +from paperless.version import __full_version_str__ +from paperless_mail.models import MailRule + +if TYPE_CHECKING: + import datetime + from types import TracebackType + + from paperless.parsers import MetadataEntry + from paperless.parsers import ParserContext + +logger = logging.getLogger("paperless.parsing.mail") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "message/rfc822": ".eml", +} + + +class MailDocumentParser: + """Parse .eml email files for Paperless-ngx. + + Uses imap_tools to parse .eml files, generates a PDF using Gotenberg, + and sends the HTML part to a Tika server for text extraction. Because + EML files cannot be rendered natively in a browser, the parser always + produces a PDF rendition (requires_pdf_rendition=True). 
+ + Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to + apply mail-rule-specific PDF layout options: + + parser.configure(ParserContext(mailrule_id=rule.pk)) + parser.parse(path, mime_type) + + Class attributes + ---------------- + name : str + Human-readable parser name. + version : str + Semantic version string, kept in sync with Paperless-ngx releases. + author : str + Maintainer name. + url : str + Issue tracker / source URL. + """ + + name: str = "Paperless-ngx Mail Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" + + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ + + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + """Return the MIME types this parser handles. + + Returns + ------- + dict[str, str] + Mapping of MIME type to preferred file extension. + """ + return _SUPPORTED_MIME_TYPES + + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + """Return the priority score for handling this file. + + Parameters + ---------- + mime_type: + Detected MIME type of the file. + filename: + Original filename including extension. + path: + Optional filesystem path. Not inspected by this parser. + + Returns + ------- + int | None + 10 if the MIME type is supported, otherwise None. + """ + if mime_type in _SUPPORTED_MIME_TYPES: + return 10 + return None + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def can_produce_archive(self) -> bool: + """Whether this parser can produce a searchable PDF archive copy. 
+ + Returns + ------- + bool + Always False — the mail parser produces a display PDF + (requires_pdf_rendition=True), not an optional OCR archive. + """ + return False + + @property + def requires_pdf_rendition(self) -> bool: + """Whether the parser must produce a PDF for the frontend to display. + + Returns + ------- + bool + Always True — EML files cannot be rendered natively in a browser, + so a PDF conversion is always required for display. + """ + return True + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self._tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), + ) + self._text: str | None = None + self._date: datetime.datetime | None = None + self._archive_path: Path | None = None + self._mailrule_id: int | None = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + logger.debug("Cleaning up temporary directory %s", self._tempdir) + shutil.rmtree(self._tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def configure(self, context: ParserContext) -> None: + self._mailrule_id = context.mailrule_id + + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: + """Parse the given .eml into formatted text and a PDF archive. + + Call ``configure(ParserContext(mailrule_id=...))`` before this method + to apply mail-rule-specific PDF layout options. 
The ``produce_archive`` + flag is accepted for protocol compatibility but is ignored — + the mail parser always produces a PDF since EML files cannot be + displayed natively. + + Parameters + ---------- + document_path: + Absolute path to the .eml file. + mime_type: + Detected MIME type of the document (should be "message/rfc822"). + produce_archive: + Accepted for protocol compatibility. The PDF rendition is always + produced since EML files cannot be displayed natively in a browser. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be parsed or PDF generation fails. + """ + + def strip_text(text: str) -> str: + """Reduces the spacing of the given text string.""" + text = re.sub(r"\s+", " ", text) + text = re.sub(r"(\n *)+", "\n", text) + return text.strip() + + def build_formatted_text(mail_message: MailMessage) -> str: + """Constructs a formatted string based on the given email.""" + fmt_text = f"Subject: {mail_message.subject}\n\n" + fmt_text += f"From: {mail_message.from_values.full if mail_message.from_values else ''}\n\n" + to_list = [address.full for address in mail_message.to_values] + fmt_text += f"To: {', '.join(to_list)}\n\n" + if mail_message.cc_values: + fmt_text += ( + f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" + ) + if mail_message.bcc_values: + fmt_text += ( + f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" + ) + if mail_message.attachments: + att = [] + for a in mail.attachments: + attachment_size = naturalsize(a.size, binary=True, format="%.2f") + att.append( + f"{a.filename} ({attachment_size})", + ) + fmt_text += f"Attachments: {', '.join(att)}\n\n" + + if mail.html: + fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html)) + + fmt_text += f"\n\n{strip_text(mail.text)}" + + return fmt_text + + logger.debug("Parsing file %s into an email", document_path.name) + mail = self.parse_file_to_message(document_path) + + logger.debug("Building formatted text from 
email") + self._text = build_formatted_text(mail) + + if is_naive(mail.date): + self._date = make_aware(mail.date) + else: + self._date = mail.date + + logger.debug("Creating a PDF from the email") + if self._mailrule_id: + rule = MailRule.objects.get(pk=self._mailrule_id) + self._archive_path = self.generate_pdf( + mail, + MailRule.PdfLayout(rule.pdf_layout), + ) + else: + self._archive_path = self.generate_pdf(mail) + + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ + + def get_text(self) -> str | None: + """Return the plain-text content extracted during parse. + + Returns + ------- + str | None + Extracted text, or None if parse has not been called yet. + """ + return self._text + + def get_date(self) -> datetime.datetime | None: + """Return the document date detected during parse. + + Returns + ------- + datetime.datetime | None + Date from the email headers, or None if not detected. + """ + return self._date + + def get_archive_path(self) -> Path | None: + """Return the path to the generated archive PDF, or None. + + Returns + ------- + Path | None + Path to the PDF produced by Gotenberg, or None if parse has not + been called yet. + """ + return self._archive_path + + # ------------------------------------------------------------------ + # Thumbnail and metadata + # ------------------------------------------------------------------ + + def get_thumbnail( + self, + document_path: Path, + mime_type: str, + file_name: str | None = None, + ) -> Path: + """Generate a thumbnail from the PDF rendition of the email. + + Converts the document to PDF first if not already done. + + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + file_name: + Kept for backward compatibility; not used. 
+ + Returns + ------- + Path + Path to the generated WebP thumbnail inside the temporary directory. + """ + if not self._archive_path: + self._archive_path = self.generate_pdf( + self.parse_file_to_message(document_path), + ) + + return make_thumbnail_from_pdf( + self._archive_path, + self._tempdir, + ) + + def get_page_count( + self, + document_path: Path, + mime_type: str, + ) -> int | None: + """Return the number of pages in the document. + + Counts pages in the archive PDF produced by a preceding parse() + call. Returns ``None`` if parse() has not been called yet or if + no archive was produced. + + Returns + ------- + int | None + Page count of the archive PDF, or ``None``. + """ + if self._archive_path is not None: + from paperless.parsers.utils import get_page_count_for_pdf + + return get_page_count_for_pdf(self._archive_path, log=logger) + return None + + def extract_metadata( + self, + document_path: Path, + mime_type: str, + ) -> list[MetadataEntry]: + """Extract metadata from the email headers. + + Returns email headers as metadata entries with prefix "header", + plus summary entries for attachments and date. + + Returns + ------- + list[MetadataEntry] + Sorted list of metadata entries, or ``[]`` on parse failure. 
+ """ + result: list[MetadataEntry] = [] + + try: + mail = self.parse_file_to_message(document_path) + except ParseError as e: + logger.warning( + "Error while fetching document metadata for %s: %s", + document_path, + e, + ) + return result + + for key, header_values in mail.headers.items(): + value = ", ".join(header_values) + try: + value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + logger.debug("Skipping header %s: %s", key, e) + continue + + result.append( + { + "namespace": "", + "prefix": "header", + "key": key, + "value": value, + }, + ) + + result.append( + { + "namespace": "", + "prefix": "", + "key": "attachments", + "value": ", ".join( + f"{attachment.filename}" + f"({naturalsize(attachment.size, binary=True, format='%.2f')})" + for attachment in mail.attachments + ), + }, + ) + + result.append( + { + "namespace": "", + "prefix": "", + "key": "date", + "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"), + }, + ) + + result.sort(key=lambda item: (item["prefix"], item["key"])) + return result + + # ------------------------------------------------------------------ + # Email-specific methods + # ------------------------------------------------------------------ + + def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None: + """Convert the OCR output type setting to a Gotenberg PdfAFormat.""" + if settings.OCR_OUTPUT_TYPE in { + OutputTypeChoices.PDF_A, + OutputTypeChoices.PDF_A2, + }: + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover + logger.warning( + "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", + ) + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover + return PdfAFormat.A3b + return None + + @staticmethod + def parse_file_to_message(filepath: Path) -> MailMessage: + """Parse the given .eml file into a MailMessage object. + + Parameters + ---------- + filepath: + Path to the .eml file. 
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
    """Parse the given .eml file into a MailMessage object.

    Parameters
    ----------
    filepath:
        Path to the .eml file.

    Returns
    -------
    MailMessage
        Parsed mail message.

    Raises
    ------
    documents.parsers.ParseError
        If the file cannot be read or parsed, or if the parsed message
        has no 'from' value.
    """
    try:
        parsed = MailMessage.from_bytes(filepath.read_bytes())
        # Read inside the try so that any failure while evaluating the
        # sender is still reported as a ParseError.
        has_sender = parsed.from_values is not None
    except Exception as err:
        raise ParseError(
            f"Could not parse {filepath}: {err}",
        ) from err

    # Raised outside the try block: raising it inside would have it caught
    # by the handler above and wrapped again, duplicating the
    # "Could not parse ..." prefix in the final message.
    if not has_sender:
        raise ParseError(
            f"Could not parse {filepath}: Missing 'from'",
        )

    return parsed

def tika_parse(self, html: str) -> str:
    """Send HTML content to the Tika server for text extraction.

    Parameters
    ----------
    html:
        HTML string to parse.

    Returns
    -------
    str
        Extracted plain text with surrounding whitespace stripped, or an
        empty string when Tika reports no content.

    Raises
    ------
    documents.parsers.ParseError
        If anything goes wrong while talking to the Tika server at
        ``settings.TIKA_ENDPOINT``.
    """
    logger.info("Sending content to Tika server")

    try:
        with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
            parsed = client.tika.as_text.from_buffer(html, "text/html")

            if parsed.content is not None:
                return parsed.content.strip()
            return ""
    except Exception as err:
        raise ParseError(
            f"Could not parse content with tika server at "
            f"{settings.TIKA_ENDPOINT}: {err}",
        ) from err
def generate_pdf(
    self,
    mail_message: MailMessage,
    pdf_layout: MailRule.PdfLayout | None = None,
) -> Path:
    """Render the email to a single PDF in the temporary directory.

    The message itself is always rendered to a PDF first. When the mail
    carries HTML content, that content is rendered to a second PDF and
    both are merged through Gotenberg in the order given by the layout.

    Parameters
    ----------
    mail_message:
        Parsed email message.
    pdf_layout:
        Layout option for the PDF. When ``None``, the
        ``EMAIL_PARSE_DEFAULT_LAYOUT`` setting is used instead.

    Returns
    -------
    Path
        Path to ``merged.pdf`` inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If running the merge or storing its result fails.
    """
    merged_pdf = Path(self._tempdir) / "merged.pdf"

    text_pdf = self.generate_pdf_from_mail(mail_message)

    if pdf_layout is None:
        pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)

    # Without HTML content there is nothing to merge: the rendered
    # message is the final document.
    if not mail_message.html:
        merged_pdf.write_bytes(text_pdf.read_bytes())
        return merged_pdf

    html_pdf = self.generate_pdf_from_html(
        mail_message.html,
        mail_message.attachments,
    )

    logger.debug("Merging email text and HTML content into single PDF")

    # Any layout not listed here (including TEXT_HTML) falls back to
    # "message first, HTML second".
    merge_orders = {
        MailRule.PdfLayout.HTML_TEXT: [html_pdf, text_pdf],
        MailRule.PdfLayout.HTML_ONLY: [html_pdf],
        MailRule.PdfLayout.TEXT_ONLY: [text_pdf],
    }
    files_to_merge = merge_orders.get(pdf_layout, [text_pdf, html_pdf])

    with (
        GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client,
        client.merge.merge() as route,
    ):
        # Configure requested PDF/A formatting, if any
        pdf_a_format = self._settings_to_gotenberg_pdfa()
        if pdf_a_format is not None:
            route.pdf_format(pdf_a_format)

        route.merge(files_to_merge)

        try:
            response = route.run()
            merged_pdf.write_bytes(response.content)
        except Exception as err:
            raise ParseError(
                f"Error while merging email HTML into PDF: {err}",
            ) from err

    return merged_pdf
def mail_to_html(self, mail: MailMessage) -> Path:
    """Convert the given email into an HTML file using a template.

    The subject, addresses, attachment summary, date and plain-text body
    are escaped, cleaned and linkified, then rendered with the
    ``email_msg_template.html`` template.

    Parameters
    ----------
    mail:
        Parsed mail message.

    Returns
    -------
    Path
        Path to the rendered HTML file inside the temporary directory.
    """

    def clean_html(text: str) -> str:
        """Attempt to clean, escape, and linkify the given HTML string."""
        # Despite the annotation, lists and other objects are tolerated
        # and converted to text first.
        if isinstance(text, list):
            text = "\n".join([str(e) for e in text])
        if not isinstance(text, str):
            text = str(text)
        text = escape(text)
        text = clean(text)
        text = linkify(text, parse_email=True)
        # Newlines become HTML line breaks.
        text = text.replace("\n", "<br>")
        return text

    data: dict[str, str] = {}

    def add_field(key: str, label: str, raw: str) -> None:
        """Store the cleaned value; add its label only when non-empty."""
        data[key] = clean_html(raw)
        if data[key]:
            data[f"{key}_label"] = label

    add_field("subject", "Subject", mail.subject)
    add_field("from", "From", mail.from_values.full if mail.from_values else "")
    add_field("to", "To", ", ".join(address.full for address in mail.to_values))
    add_field("cc", "CC", ", ".join(address.full for address in mail.cc_values))
    add_field("bcc", "BCC", ", ".join(address.full for address in mail.bcc_values))
    add_field(
        "attachments",
        "Attachments",
        ", ".join(
            f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})"
            for a in mail.attachments
        ),
    )

    # timezone.localtime() raises ValueError for naive datetimes.
    # NOTE(review): mail.date is presumably naive for mails whose Date
    # header is missing or carries no offset - confirm against imap_tools.
    mail_date = mail.date
    if timezone.is_naive(mail_date):
        mail_date = timezone.make_aware(mail_date)

    data["date"] = clean_html(
        timezone.localtime(mail_date).strftime("%Y-%m-%d %H:%M"),
    )
    data["content"] = clean_html(mail.text.strip())

    from django.template.loader import render_to_string

    html_file = Path(self._tempdir) / "email_as_html.html"
    # Explicit encoding: the default depends on the process locale and
    # can fail or mis-encode non-ASCII mail content.
    html_file.write_text(
        render_to_string("email_msg_template.html", context=data),
        encoding="utf-8",
    )

    return html_file
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
    """Render the email to a PDF via the HTML template and Gotenberg.

    The mail is first written out as HTML, then converted with
    Gotenberg's Chromium route on A4 paper with 0.1 inch margins.

    Parameters
    ----------
    mail:
        Parsed mail message.

    Returns
    -------
    Path
        Path to ``email_as_pdf.pdf`` inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If building or running the Gotenberg conversion fails.
    """
    logger.info("Converting mail to PDF")

    source_root = Path(__file__).parent.parent.parent
    stylesheet = source_root / "paperless_mail" / "templates" / "output.css"
    rendered_html = self.mail_to_html(mail)

    with (
        GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client,
        client.chromium.html_to_pdf() as route,
    ):
        # Configure requested PDF/A formatting, if any
        pdf_a_format = self._settings_to_gotenberg_pdfa()
        if pdf_a_format is not None:
            route.pdf_format(pdf_a_format)

        try:
            request = route.index(rendered_html).resource(stylesheet)
            page_margins = PageMarginsType(
                **{
                    side: Measurement(0.1, MeasurementUnitType.Inches)
                    for side in ("top", "bottom", "left", "right")
                },
            )
            request = request.margins(page_margins).size(A4).scale(1.0)
            response = request.run()
        except Exception as err:
            raise ParseError(
                f"Error while converting email to PDF: {err}",
            ) from err

    pdf_path = Path(self._tempdir) / "email_as_pdf.pdf"
    pdf_path.write_bytes(response.content)

    return pdf_path