diff --git a/pyproject.toml b/pyproject.toml
index 43ad1e1cc..d41a918c0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -256,7 +256,7 @@ lint.isort.force-single-line = true
[tool.codespell]
write-changes = true
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
-skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
+skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
[tool.pytest]
minversion = "9.0"
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 11abf46d4..ba2bba473 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -51,10 +51,11 @@ from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
+from paperless.parsers import ParserContext
+from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
-from paperless_mail.parsers import MailDocumentParser
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -71,7 +72,12 @@ def _parser_cleanup(parser: DocumentParser) -> None:
"""
if isinstance(
parser,
- (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
+ (
+ MailDocumentParser,
+ RemoteDocumentParser,
+ TextDocumentParser,
+ TikaDocumentParser,
+ ),
):
parser.__exit__(None, None, None)
else:
@@ -453,13 +459,20 @@ class ConsumerPlugin(
progress_callback=progress_callback,
)
+ parser_is_new_style = isinstance(
+ document_parser,
+ (
+ MailDocumentParser,
+ RemoteDocumentParser,
+ TextDocumentParser,
+ TikaDocumentParser,
+ ),
+ )
+
# New-style parsers use __enter__/__exit__ for resource management.
# _parser_cleanup (below) handles __exit__; call __enter__ here.
# TODO(stumpylog): Remove me in the future
- if isinstance(
- document_parser,
- (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
- ):
+ if parser_is_new_style:
document_parser.__enter__()
self.log.debug(f"Parser: {type(document_parser).__name__}")
@@ -480,20 +493,12 @@ class ConsumerPlugin(
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
- if (
- isinstance(document_parser, MailDocumentParser)
- and self.input_doc.mailrule_id
- ):
- document_parser.parse(
- self.working_copy,
- mime_type,
- self.filename,
- self.input_doc.mailrule_id,
+
+ # TODO(stumpylog): Remove me in the future when all parsers use new protocol
+ if parser_is_new_style:
+ document_parser.configure(
+ ParserContext(mailrule_id=self.input_doc.mailrule_id),
)
- elif isinstance(
- document_parser,
- (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
- ):
# TODO(stumpylog): Remove me in the future
document_parser.parse(self.working_copy, mime_type)
else:
@@ -506,11 +511,8 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
- if isinstance(
- document_parser,
- (TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
- ):
- # TODO(stumpylog): Remove me in the future
+ # TODO(stumpylog): Remove me in the future when all parsers use new protocol
+ if parser_is_new_style:
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
else:
thumbnail = document_parser.get_thumbnail(
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 378695731..947da878f 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -65,6 +65,11 @@ from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
+from paperless.parsers import ParserContext
+from paperless.parsers.mail import MailDocumentParser
+from paperless.parsers.remote import RemoteDocumentParser
+from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
from paperless_ai.indexing import llm_index_add_or_update_document
from paperless_ai.indexing import llm_index_remove_document
from paperless_ai.indexing import update_llm_index
@@ -304,7 +309,9 @@ def update_document_content_maybe_archive_file(document_id) -> None:
mime_type = document.mime_type
- parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+ parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
+ mime_type,
+ )
if not parser_class:
logger.error(
@@ -315,14 +322,41 @@ def update_document_content_maybe_archive_file(document_id) -> None:
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
- try:
- parser.parse(document.source_path, mime_type, document.get_public_filename())
+ parser_is_new_style = isinstance(
+ parser,
+ (
+ MailDocumentParser,
+ RemoteDocumentParser,
+ TextDocumentParser,
+ TikaDocumentParser,
+ ),
+ )
- thumbnail = parser.get_thumbnail(
- document.source_path,
- mime_type,
- document.get_public_filename(),
- )
+ # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+ if parser_is_new_style:
+ parser.__enter__()
+
+ try:
+ # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+ if parser_is_new_style:
+ parser.configure(ParserContext())
+ parser.parse(document.source_path, mime_type)
+ else:
+ parser.parse(
+ document.source_path,
+ mime_type,
+ document.get_public_filename(),
+ )
+
+ # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+ if parser_is_new_style:
+ thumbnail = parser.get_thumbnail(document.source_path, mime_type)
+ else:
+ thumbnail = parser.get_thumbnail(
+ document.source_path,
+ mime_type,
+ document.get_public_filename(),
+ )
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
@@ -403,8 +437,14 @@ def update_document_content_maybe_archive_file(document_id) -> None:
f"Error while parsing document {document} (ID: {document_id})",
)
finally:
- # TODO(stumpylog): Cleanup once all parsers are handled
- parser.cleanup()
+        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+        # Reuse the same new-style check that guarded __enter__() above so that
+        # every parser that was entered (RemoteDocumentParser included) is also
+        # exited, instead of falling through to the legacy cleanup() call.
+        if parser_is_new_style:
+            parser.__exit__(None, None, None)
+        else:
+            parser.cleanup()
@shared_task
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index a59c7d676..a3574fdce 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -36,7 +36,6 @@ from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import GetConsumerMixin
from paperless_mail.models import MailRule
-from paperless_mail.parsers import MailDocumentParser
class _BaseTestParser(DocumentParser):
@@ -1091,7 +1090,7 @@ class TestConsumer(
self.assertEqual(command[1], "--replace-input")
@mock.patch("paperless_mail.models.MailRule.objects.get")
- @mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
+ @mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_mail_parser_receives_mailrule(
self,
@@ -1107,11 +1106,13 @@ class TestConsumer(
THEN:
- The mail parser should receive the mail rule
"""
+ from paperless_mail.signals import get_parser as mail_get_parser
+
mock_consumer_declaration_send.return_value = [
(
None,
{
- "parser": MailDocumentParser,
+ "parser": mail_get_parser,
"mime_types": {"message/rfc822": ".eml"},
"weight": 0,
},
@@ -1123,9 +1124,10 @@ class TestConsumer(
with self.get_consumer(
filepath=(
Path(__file__).parent.parent.parent
- / Path("paperless_mail")
+ / Path("paperless")
/ Path("tests")
/ Path("samples")
+ / Path("mail")
).resolve()
/ "html.eml",
source=DocumentSource.MailFetch,
@@ -1136,12 +1138,10 @@ class TestConsumer(
ConsumerError,
):
consumer.run()
- mock_mail_parser_parse.assert_called_once_with(
- consumer.working_copy,
- "message/rfc822",
- file_name="sample.pdf",
- mailrule=mock_mailrule_get.return_value,
- )
+ mock_mail_parser_parse.assert_called_once_with(
+ consumer.working_copy,
+ "message/rfc822",
+ )
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py
index ea67ade00..c9c1530a5 100644
--- a/src/paperless/parsers/__init__.py
+++ b/src/paperless/parsers/__init__.py
@@ -35,6 +35,7 @@ Usage example (third-party parser)::
from __future__ import annotations
+from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Protocol
from typing import Self
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
__all__ = [
"MetadataEntry",
+ "ParserContext",
"ParserProtocol",
]
@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
"""String representation of the field value."""
+@dataclass(frozen=True, slots=True)
+class ParserContext:
+    """Read-only, per-ingestion settings handed to a parser.
+
+    The consumer builds one instance for each ingestion event and passes
+    it to ``parser.configure(context)`` ahead of ``parser.parse()``.  A
+    parser picks out the fields it cares about and ignores the rest.
+
+    The dataclass is declared with ``frozen=True`` so the context cannot
+    be mutated once it has been handed off, and with ``slots=True`` so
+    instances stay lightweight.
+
+    Attributes
+    ----------
+    mailrule_id : int | None
+        Primary key of the ``MailRule`` that triggered the ingestion, or
+        ``None`` (the default) when no mail rule was involved.
+        ``MailDocumentParser`` uses it to choose the PDF layout.
+
+    Notes
+    -----
+    Planned fields that are not implemented yet:
+
+    * ``output_type`` — PDF/A variant for archive generation, replacing
+      reads of ``settings.OCR_OUTPUT_TYPE`` inside parsers.
+    * ``ocr_mode`` — skip-text, redo, force, etc., replacing reads of
+      ``settings.OCR_MODE`` inside parsers.
+    * ``ocr_language`` — Tesseract language string, replacing reads of
+      ``settings.OCR_LANGUAGE`` inside parsers.
+
+    Once those fields exist, the consumer is expected to read the Django
+    settings a single time and populate the context, so parsers no longer
+    touch ``settings.*`` directly.
+    """
+
+    mailrule_id: int | None = None
+
+
@runtime_checkable
class ParserProtocol(Protocol):
"""Structural contract for all Paperless-ngx document parsers.
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
# Core parsing interface
# ------------------------------------------------------------------
+    def configure(self, context: ParserContext) -> None:
+        """Receive per-ingestion context ahead of parse().
+
+        The consumer invokes this once the parser has been instantiated
+        and before it calls parse().  An implementation keeps whichever
+        context fields it needs and may ignore the rest; this protocol
+        stub itself does nothing.
+
+        Parameters
+        ----------
+        context:
+            Immutable context the consumer assembled for this specific
+            ingestion event.
+        """
+        ...
+
def parse(
self,
document_path: Path,
diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py
new file mode 100644
index 000000000..9914b2ec6
--- /dev/null
+++ b/src/paperless/parsers/mail.py
@@ -0,0 +1,834 @@
+"""
+Built-in mail document parser.
+
+Handles message/rfc822 (EML) MIME type by:
+- Parsing the email using imap_tools
+- Generating a PDF via Gotenberg (for display and archive)
+- Extracting text via Tika for HTML content
+- Extracting metadata from email headers
+
+The parser always produces a PDF because EML files cannot be rendered
+natively in a browser (requires_pdf_rendition=True).
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import shutil
+import tempfile
+from html import escape
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
+
+from bleach import clean
+from bleach import linkify
+from django.conf import settings
+from django.utils import timezone
+from django.utils.timezone import is_naive
+from django.utils.timezone import make_aware
+from gotenberg_client import GotenbergClient
+from gotenberg_client.constants import A4
+from gotenberg_client.options import Measurement
+from gotenberg_client.options import MeasurementUnitType
+from gotenberg_client.options import PageMarginsType
+from gotenberg_client.options import PdfAFormat
+from humanize import naturalsize
+from imap_tools import MailAttachment
+from imap_tools import MailMessage
+from tika_client import TikaClient
+
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless.models import OutputTypeChoices
+from paperless.version import __full_version_str__
+from paperless_mail.models import MailRule
+
+if TYPE_CHECKING:
+ import datetime
+ from types import TracebackType
+
+ from paperless.parsers import MetadataEntry
+ from paperless.parsers import ParserContext
+
+logger = logging.getLogger("paperless.parsing.mail")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+ "message/rfc822": ".eml",
+}
+
+
+class MailDocumentParser:
+ """Parse .eml email files for Paperless-ngx.
+
+ Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
+ and sends the HTML part to a Tika server for text extraction. Because
+ EML files cannot be rendered natively in a browser, the parser always
+ produces a PDF rendition (requires_pdf_rendition=True).
+
+ Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
+ apply mail-rule-specific PDF layout options:
+
+ parser.configure(ParserContext(mailrule_id=rule.pk))
+ parser.parse(path, mime_type)
+
+ Class attributes
+ ----------------
+ name : str
+ Human-readable parser name.
+ version : str
+ Semantic version string, kept in sync with Paperless-ngx releases.
+ author : str
+ Maintainer name.
+ url : str
+ Issue tracker / source URL.
+ """
+
+ name: str = "Paperless-ngx Mail Parser"
+ version: str = __full_version_str__
+ author: str = "Paperless-ngx Contributors"
+ url: str = "https://github.com/paperless-ngx/paperless-ngx"
+
+ # ------------------------------------------------------------------
+ # Class methods
+ # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Report which MIME types the mail parser accepts.
+
+        Returns
+        -------
+        dict[str, str]
+            The module-level mapping of MIME type to preferred file
+            extension.  The same dict object is returned on every call,
+            not a copy.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Rate how well this parser suits the given file.
+
+        Only the MIME type is consulted; ``filename`` and ``path`` are
+        accepted but never inspected.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path.
+
+        Returns
+        -------
+        int | None
+            10 when the MIME type is one of the supported types,
+            otherwise None.
+        """
+        return 10 if mime_type in _SUPPORTED_MIME_TYPES else None
+
+ # ------------------------------------------------------------------
+ # Properties
+ # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Tell whether an optional searchable archive copy can be produced.
+
+        Returns
+        -------
+        bool
+            Always False.  The PDF this parser generates is the display
+            rendition (see ``requires_pdf_rendition``), not an optional
+            OCR archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Tell whether a PDF must be generated for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always True.  A browser cannot render an EML file natively,
+            so a PDF conversion is needed for display.
+        """
+        return True
+
+ # ------------------------------------------------------------------
+ # Lifecycle
+ # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        """Create the scratch working directory and reset all parse results.
+
+        Parameters
+        ----------
+        logging_group:
+            Accepted but not used by this parser.
+        """
+        scratch_root = settings.SCRATCH_DIR
+        scratch_root.mkdir(parents=True, exist_ok=True)
+        workdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)
+        self._tempdir = Path(workdir)
+
+        # Set by configure(); selects the mail rule whose PDF layout is used.
+        self._mailrule_id: int | None = None
+
+        # Populated by parse().
+        self._text: str | None = None
+        self._date: datetime.datetime | None = None
+        self._archive_path: Path | None = None
+
+    def __enter__(self) -> Self:
+        """Enter the context manager and hand back the parser itself.
+
+        No setup happens here; the temporary directory is created by the
+        constructor.
+        """
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Remove the parser's temporary directory.
+
+        Removal errors are ignored.  The method returns None, so an
+        exception raised inside the ``with`` body is not suppressed.
+        """
+        workdir = self._tempdir
+        logger.debug("Cleaning up temporary directory %s", workdir)
+        shutil.rmtree(workdir, ignore_errors=True)
+
+ # ------------------------------------------------------------------
+ # Core parsing interface
+ # ------------------------------------------------------------------
+
+    def configure(self, context: ParserContext) -> None:
+        """Remember the mail rule id carried by *context*.
+
+        parse() later uses the stored id to look up the rule and apply
+        its PDF layout.
+
+        Parameters
+        ----------
+        context:
+            Context for the current ingestion; only ``mailrule_id`` is read.
+        """
+        self._mailrule_id = context.mailrule_id
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Parse the given .eml into formatted text and a PDF rendition.
+
+        Call ``configure(ParserContext(mailrule_id=...))`` before this method
+        to apply mail-rule-specific PDF layout options.  On success the
+        extracted text, the message date and the path of the generated PDF
+        are stored on the parser and exposed through ``get_text()``,
+        ``get_date()`` and ``get_archive_path()``.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the .eml file.
+        mime_type:
+            Detected MIME type of the document (should be "message/rfc822").
+        produce_archive:
+            Accepted for protocol compatibility and otherwise ignored: the
+            PDF rendition is always produced because EML files cannot be
+            displayed natively in a browser.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be parsed or PDF generation fails.
+        """
+
+        def strip_text(text: str) -> str:
+            """Collapse every whitespace run (newlines included) to one space."""
+            # A former second substitution on "\n" could never match, because
+            # this one already replaces all newlines; it has been dropped.
+            return re.sub(r"\s+", " ", text).strip()
+
+        def build_formatted_text(mail_message: MailMessage) -> str:
+            """Construct a formatted string based on the given email."""
+            # Only the parameter is used here, never the enclosing scope's
+            # message variable, so the helper does not depend on call order.
+            sender = mail_message.from_values.full if mail_message.from_values else ""
+            fmt_text = f"Subject: {mail_message.subject}\n\n"
+            fmt_text += f"From: {sender}\n\n"
+            to_list = [address.full for address in mail_message.to_values]
+            fmt_text += f"To: {', '.join(to_list)}\n\n"
+            if mail_message.cc_values:
+                cc_list = [address.full for address in mail_message.cc_values]
+                fmt_text += f"CC: {', '.join(cc_list)}\n\n"
+            if mail_message.bcc_values:
+                bcc_list = [address.full for address in mail_message.bcc_values]
+                fmt_text += f"BCC: {', '.join(bcc_list)}\n\n"
+            if mail_message.attachments:
+                att = []
+                for a in mail_message.attachments:
+                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
+                    att.append(
+                        f"{a.filename} ({attachment_size})",
+                    )
+                fmt_text += f"Attachments: {', '.join(att)}\n\n"
+
+            if mail_message.html:
+                fmt_text += "HTML content: " + strip_text(
+                    self.tika_parse(mail_message.html),
+                )
+
+            fmt_text += f"\n\n{strip_text(mail_message.text)}"
+
+            return fmt_text
+
+        logger.debug("Parsing file %s into an email", document_path.name)
+        mail = self.parse_file_to_message(document_path)
+
+        logger.debug("Building formatted text from email")
+        self._text = build_formatted_text(mail)
+
+        # Store an aware datetime regardless of how the header was parsed.
+        if is_naive(mail.date):
+            self._date = make_aware(mail.date)
+        else:
+            self._date = mail.date
+
+        logger.debug("Creating a PDF from the email")
+        if self._mailrule_id:
+            rule = MailRule.objects.get(pk=self._mailrule_id)
+            self._archive_path = self.generate_pdf(
+                mail,
+                MailRule.PdfLayout(rule.pdf_layout),
+            )
+        else:
+            # No rule configured: generate_pdf() falls back to the default layout.
+            self._archive_path = self.generate_pdf(mail)
+
+ # ------------------------------------------------------------------
+ # Result accessors
+ # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the text that parse() assembled from the email.
+
+        Returns
+        -------
+        str | None
+            The formatted text, or None while parse() has not stored a
+            result yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the message date stored by parse().
+
+        Returns
+        -------
+        datetime.datetime | None
+            The date taken from the parsed email, or None while parse()
+            has not stored one.
+        """
+        return self._date
+
+    def get_archive_path(self) -> Path | None:
+        """Return where the generated PDF lives, if one exists.
+
+        Returns
+        -------
+        Path | None
+            Path to the PDF produced via Gotenberg, or None while no PDF
+            has been generated yet.
+        """
+        return self._archive_path
+
+ # ------------------------------------------------------------------
+ # Thumbnail and metadata
+ # ------------------------------------------------------------------
+
+    def get_thumbnail(
+        self,
+        document_path: Path,
+        mime_type: str,
+        file_name: str | None = None,
+    ) -> Path:
+        """Build a thumbnail from the email's PDF rendition.
+
+        When no PDF has been generated yet, the email is parsed and
+        converted first (using the default layout) and the result is
+        cached for later calls.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+        file_name:
+            Kept for backward compatibility; not used.
+
+        Returns
+        -------
+        Path
+            Path of the thumbnail created by ``make_thumbnail_from_pdf``
+            in the parser's temporary directory.
+        """
+        if not self._archive_path:
+            message = self.parse_file_to_message(document_path)
+            self._archive_path = self.generate_pdf(message)
+
+        return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Count the pages of the PDF generated for this email.
+
+        Neither argument is inspected; the count is taken from the PDF
+        already stored on the parser.
+
+        Returns
+        -------
+        int | None
+            Page count of the generated PDF, or ``None`` when no PDF has
+            been produced yet.
+        """
+        archive = self._archive_path
+        if archive is None:
+            return None
+
+        from paperless.parsers.utils import get_page_count_for_pdf
+
+        return get_page_count_for_pdf(archive, log=logger)
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Collect metadata entries from the email's headers.
+
+        Every header becomes an entry with prefix "header"; two summary
+        entries with an empty prefix ("attachments" and "date") are added
+        on top.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Entries sorted by (prefix, key), or ``[]`` when the file
+            cannot be parsed.
+        """
+        try:
+            mail = self.parse_file_to_message(document_path)
+        except ParseError as e:
+            logger.warning(
+                "Error while fetching document metadata for %s: %s",
+                document_path,
+                e,
+            )
+            return []
+
+        entries: list[MetadataEntry] = []
+
+        for key, header_values in mail.headers.items():
+            value = ", ".join(header_values)
+            # Drop headers whose joined value cannot be encoded as UTF-8.
+            try:
+                value.encode("utf-8")
+            except UnicodeEncodeError as e:  # pragma: no cover
+                logger.debug("Skipping header %s: %s", key, e)
+                continue
+
+            entries.append(
+                {
+                    "namespace": "",
+                    "prefix": "header",
+                    "key": key,
+                    "value": value,
+                },
+            )
+
+        attachment_summary = ", ".join(
+            f"{item.filename}"
+            f"({naturalsize(item.size, binary=True, format='%.2f')})"
+            for item in mail.attachments
+        )
+        entries.extend(
+            [
+                {
+                    "namespace": "",
+                    "prefix": "",
+                    "key": "attachments",
+                    "value": attachment_summary,
+                },
+                {
+                    "namespace": "",
+                    "prefix": "",
+                    "key": "date",
+                    "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
+                },
+            ],
+        )
+
+        return sorted(entries, key=lambda item: (item["prefix"], item["key"]))
+
+ # ------------------------------------------------------------------
+ # Email-specific methods
+ # ------------------------------------------------------------------
+
+    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
+        """Map ``settings.OCR_OUTPUT_TYPE`` to a Gotenberg PDF/A format.
+
+        Returns
+        -------
+        PdfAFormat | None
+            ``A2b`` for the PDF/A, PDF/A-2 and PDF/A-1 choices (the latter
+            with a warning), ``A3b`` for PDF/A-3, and None for any other
+            value.
+        """
+        output_type = settings.OCR_OUTPUT_TYPE
+
+        if output_type in {OutputTypeChoices.PDF_A, OutputTypeChoices.PDF_A2}:
+            return PdfAFormat.A2b
+        if output_type == OutputTypeChoices.PDF_A1:  # pragma: no cover
+            logger.warning(
+                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+            )
+            return PdfAFormat.A2b
+        if output_type == OutputTypeChoices.PDF_A3:  # pragma: no cover
+            return PdfAFormat.A3b
+        return None
+
+    @staticmethod
+    def parse_file_to_message(filepath: Path) -> MailMessage:
+        """Parse the given .eml file into a MailMessage object.
+
+        Parameters
+        ----------
+        filepath:
+            Path to the .eml file.
+
+        Returns
+        -------
+        MailMessage
+            Parsed mail message.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be read or parsed, or if the message has no
+            usable 'from' value.
+        """
+        try:
+            with filepath.open("rb") as eml:
+                parsed = MailMessage.from_bytes(eml.read())
+            sender = parsed.from_values
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {filepath}: {err}",
+            ) from err
+
+        # Checked outside the try block so this error is raised once, rather
+        # than being caught above and wrapped in a second "Could not parse".
+        if sender is None:
+            raise ParseError(
+                f"Could not parse {filepath}: Missing 'from'",
+            )
+
+        return parsed
+
+    def tika_parse(self, html: str) -> str:
+        """Extract plain text from HTML via the configured Tika server.
+
+        Parameters
+        ----------
+        html:
+            HTML string to parse.
+
+        Returns
+        -------
+        str
+            The stripped text Tika returned, or an empty string when Tika
+            returned no content.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If anything goes wrong while talking to the Tika server.
+        """
+        logger.info("Sending content to Tika server")
+
+        try:
+            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
+                parsed = client.tika.as_text.from_buffer(html, "text/html")
+
+                content = parsed.content
+                return "" if content is None else content.strip()
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse content with tika server at "
+                f"{settings.TIKA_ENDPOINT}: {err}",
+            ) from err
+
+    def generate_pdf(
+        self,
+        mail_message: MailMessage,
+        pdf_layout: MailRule.PdfLayout | None = None,
+    ) -> Path:
+        """Produce the PDF rendition of an email.
+
+        A PDF of the templated message is always generated.  When the email
+        also carries HTML content, a second PDF is generated from it and
+        both are merged through Gotenberg in the order the layout asks for.
+        Without HTML content the layout has no effect.
+
+        Parameters
+        ----------
+        mail_message:
+            Parsed email message.
+        pdf_layout:
+            Layout option for the PDF.  Falls back to the
+            EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
+
+        Returns
+        -------
+        Path
+            Path to "merged.pdf" inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the Gotenberg merge request fails.
+        """
+        archive_path = Path(self._tempdir) / "merged.pdf"
+
+        mail_pdf_file = self.generate_pdf_from_mail(mail_message)
+
+        if pdf_layout is None:
+            pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)
+
+        # Plain-text email: the templated message PDF is the whole result.
+        if not mail_message.html:
+            archive_path.write_bytes(mail_pdf_file.read_bytes())
+            return archive_path
+
+        pdf_of_html_content = self.generate_pdf_from_html(
+            mail_message.html,
+            mail_message.attachments,
+        )
+
+        logger.debug("Merging email text and HTML content into single PDF")
+
+        with (
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ) as client,
+            client.merge.merge() as route,
+        ):
+            # Configure requested PDF/A formatting, if any
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
+            if pdf_a_format is not None:
+                route.pdf_format(pdf_a_format)
+
+            # Pick which PDFs to merge, and in which order; anything that is
+            # not one of the three explicit layouts is treated as TEXT_HTML.
+            if pdf_layout == MailRule.PdfLayout.HTML_TEXT:
+                merge_order = [pdf_of_html_content, mail_pdf_file]
+            elif pdf_layout == MailRule.PdfLayout.HTML_ONLY:
+                merge_order = [pdf_of_html_content]
+            elif pdf_layout == MailRule.PdfLayout.TEXT_ONLY:
+                merge_order = [mail_pdf_file]
+            else:
+                merge_order = [mail_pdf_file, pdf_of_html_content]
+            route.merge(merge_order)
+
+            try:
+                response = route.run()
+                archive_path.write_bytes(response.content)
+            except Exception as err:
+                raise ParseError(
+                    f"Error while merging email HTML into PDF: {err}",
+                ) from err
+
+        return archive_path
+
+    def mail_to_html(self, mail: MailMessage) -> Path:
+        """Convert the given email into an HTML file using a template.
+
+        Subject, sender, recipients, attachment summary, date and the
+        plain-text body are sanitised and passed to the
+        "email_msg_template.html" template.  A ``*_label`` entry is only
+        added for header fields that are non-empty.
+
+        Parameters
+        ----------
+        mail:
+            Parsed mail message.
+
+        Returns
+        -------
+        Path
+            Path to the rendered HTML file inside the temporary directory.
+        """
+
+        def clean_html(text: str) -> str:
+            """Attempt to clean, escape, and linkify the given HTML string."""
+            if isinstance(text, list):
+                text = "\n".join([str(e) for e in text])
+            if not isinstance(text, str):
+                text = str(text)
+            text = escape(text)
+            text = clean(text)
+            text = linkify(text, parse_email=True)
+            # Keep line breaks visible once the text is rendered as HTML.
+            text = text.replace("\n", "<br>")
+            return text
+
+        data = {}
+
+        data["subject"] = clean_html(mail.subject)
+        if data["subject"]:
+            data["subject_label"] = "Subject"
+        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
+        if data["from"]:
+            data["from_label"] = "From"
+        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        if data["to"]:
+            data["to_label"] = "To"
+        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        if data["cc"]:
+            data["cc_label"] = "CC"
+        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        if data["bcc"]:
+            data["bcc_label"] = "BCC"
+
+        att = [
+            f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})"
+            for a in mail.attachments
+        ]
+        data["attachments"] = clean_html(", ".join(att))
+        if data["attachments"]:
+            data["attachments_label"] = "Attachments"
+
+        data["date"] = clean_html(
+            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
+        )
+        data["content"] = clean_html(mail.text.strip())
+
+        from django.template.loader import render_to_string
+
+        html_file = Path(self._tempdir) / "email_as_html.html"
+        html_file.write_text(render_to_string("email_msg_template.html", context=data))
+
+        return html_file
+
+    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
+        """Render the templated email as a PDF through Gotenberg.
+
+        The message is first written out as HTML via ``mail_to_html`` and
+        then converted with Gotenberg's Chromium route, on A4 paper with
+        0.1 inch margins, using the mail app's ``output.css`` stylesheet.
+
+        Parameters
+        ----------
+        mail:
+            Parsed mail message.
+
+        Returns
+        -------
+        Path
+            Path to "email_as_pdf.pdf" inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the Gotenberg conversion fails.
+        """
+        logger.info("Converting mail to PDF")
+
+        # The stylesheet lives in the paperless_mail app, three levels up
+        # from this module.
+        css_file = Path(__file__).parents[2] / "paperless_mail" / "templates" / "output.css"
+        email_html_file = self.mail_to_html(mail)
+
+        edge = Measurement(0.1, MeasurementUnitType.Inches)
+        page_margins = PageMarginsType(top=edge, bottom=edge, left=edge, right=edge)
+
+        with (
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ) as client,
+            client.chromium.html_to_pdf() as route,
+        ):
+            # Configure requested PDF/A formatting, if any
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
+            if pdf_a_format is not None:
+                route.pdf_format(pdf_a_format)
+
+            try:
+                request = route.index(email_html_file).resource(css_file)
+                request = request.margins(page_margins).size(A4).scale(1.0)
+                response = request.run()
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting email to PDF: {err}",
+                ) from err
+
+        email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
+        email_as_pdf_file.write_bytes(response.content)
+
+        return email_as_pdf_file
+
+ def generate_pdf_from_html(
+ self,
+ orig_html: str,
+ attachments: list[MailAttachment],
+ ) -> Path:
+ """Generate a PDF from the HTML content of the email.
+
+ Parameters
+ ----------
+ orig_html:
+ Raw HTML string from the email body.
+ attachments:
+ List of email attachments (used as inline resources).
+
+ Returns
+ -------
+ Path
+ Path to the generated PDF inside the temporary directory.
+
+ Raises
+ ------
+ documents.parsers.ParseError
+ If Gotenberg returns an error.
+ """
+
+ def clean_html_script(text: str) -> str:
+ compiled_open = re.compile(re.escape(" None:
- """
- This parser does not implement additional settings yet
- """
- return None
diff --git a/src/paperless_mail/signals.py b/src/paperless_mail/signals.py
index 4a1d82df6..8fe046393 100644
--- a/src/paperless_mail/signals.py
+++ b/src/paperless_mail/signals.py
@@ -1,7 +1,12 @@
def get_parser(*args, **kwargs):
- from paperless_mail.parsers import MailDocumentParser
+    from paperless.parsers.mail import MailDocumentParser
- return MailDocumentParser(*args, **kwargs)
+    # The signal-based consumer path still passes legacy constructor
+    # arguments such as logging_group and progress_callback.  They are
+    # deliberately not forwarded: the new-style parser is built without
+    # arguments.  Phase 4 will replace this signal path with the
+    # ParserRegistry.
+    return MailDocumentParser()
def mail_consumer_declaration(sender, **kwargs):
diff --git a/src/paperless_mail/tests/conftest.py b/src/paperless_mail/tests/conftest.py
index 0742edfa3..b662d46f3 100644
--- a/src/paperless_mail/tests/conftest.py
+++ b/src/paperless_mail/tests/conftest.py
@@ -1,71 +1,9 @@
from collections.abc import Generator
-from pathlib import Path
import pytest
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount
-from paperless_mail.parsers import MailDocumentParser
-
-
-@pytest.fixture(scope="session")
-def sample_dir() -> Path:
- return (Path(__file__).parent / Path("samples")).resolve()
-
-
-@pytest.fixture(scope="session")
-def broken_email_file(sample_dir: Path) -> Path:
- return sample_dir / "broken.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_file(sample_dir: Path) -> Path:
- return sample_dir / "simple_text.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
- return sample_dir / "simple_text.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
- return sample_dir / "simple_text.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_file(sample_dir: Path) -> Path:
- return sample_dir / "html.eml"
-
-
-@pytest.fixture(scope="session")
-def html_email_pdf_file(sample_dir: Path) -> Path:
- return sample_dir / "html.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def html_email_thumbnail_file(sample_dir: Path) -> Path:
- return sample_dir / "html.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_html_file(sample_dir: Path) -> Path:
- return sample_dir / "html.eml.html"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_first(sample_dir: Path) -> Path:
- return sample_dir / "first.pdf"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_second(sample_dir: Path) -> Path:
- return sample_dir / "second.pdf"
-
-
-@pytest.fixture()
-def mail_parser() -> MailDocumentParser:
- return MailDocumentParser(logging_group=None)
@pytest.fixture()
@@ -89,11 +27,3 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
@pytest.fixture()
def mail_account_handler() -> MailAccountHandler:
return MailAccountHandler()
-
-
-@pytest.fixture(scope="session")
-def nginx_base_url() -> Generator[str, None, None]:
- """
- The base URL for the nginx HTTP server we expect to be alive
- """
- yield "http://localhost:8080"