Feat(parsers): migrate MailDocumentParser to ParserProtocol

Move the mail parser from paperless_mail/parsers.py to paperless/parsers/mail.py and refactor it to implement ParserProtocol:

- Class-level name/version/author/url attributes
- supported_mime_types() and score() classmethods (score=20)
- can_produce_archive=False, requires_pdf_rendition=True
- Context manager lifecycle (__enter__/__exit__)
- New parse() signature without mailrule_id kwarg; consumer sets parser.mailrule_id before calling parse() instead
- get_text()/get_date()/get_archive_path() accessor methods
- extract_metadata() returning email headers and attachment info

Register MailDocumentParser in the ParserRegistry alongside Text and Tika parsers. Update consumer, signals, and all import sites to use the new location. Update tests to use the new accessor API, patch paths, and context-manager fixture.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 07:05:24 +00:00 · 2026-03-18 14:41:26 -07:00
parent d107c8c531
commit 3236bbd0c5
8 changed files with 523 additions and 175 deletions
@@ -51,9 +51,9 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.parsers.mail import MailDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
-from paperless_mail.parsers import MailDocumentParser

 LOGGING_NAME: Final[str] = "paperless.consumer"

@@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:

    TODO(stumpylog): Remove me in the future
    """
-    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
+    if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)):
        parser.__exit__(None, None, None)
    else:
        parser.cleanup()
@@ -477,14 +477,12 @@ class ConsumerPlugin(
                isinstance(document_parser, MailDocumentParser)
                and self.input_doc.mailrule_id
            ):
-                document_parser.parse(
-                    self.working_copy,
-                    mime_type,
-                    self.filename,
-                    self.input_doc.mailrule_id,
-                )
-            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
-                # TODO(stumpylog): Remove me in the future
+                document_parser.mailrule_id = self.input_doc.mailrule_id
+            if isinstance(
+                document_parser,
+                (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
+            ):
+                # TODO(stumpylog): Remove me in the future when all parsers use new protocol
                document_parser.parse(self.working_copy, mime_type)
            else:
                document_parser.parse(self.working_copy, mime_type, self.filename)
@@ -496,8 +494,11 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
-                # TODO(stumpylog): Remove me in the future
+            if isinstance(
+                document_parser,
+                (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
+            ):
+                # TODO(stumpylog): Remove me in the future when all parsers use new protocol
                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
            else:
                thumbnail = document_parser.get_thumbnail(
@@ -35,8 +35,8 @@ from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 from documents.tests.utils import GetConsumerMixin
+from paperless.parsers.mail import MailDocumentParser
 from paperless_mail.models import MailRule
-from paperless_mail.parsers import MailDocumentParser


 class _BaseTestParser(DocumentParser):
@@ -1091,7 +1091,7 @@ class TestConsumer(
            self.assertEqual(command[1], "--replace-input")

    @mock.patch("paperless_mail.models.MailRule.objects.get")
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
+    @mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_mail_parser_receives_mailrule(
        self,
@@ -1123,9 +1123,10 @@ class TestConsumer(
        with self.get_consumer(
            filepath=(
                Path(__file__).parent.parent.parent
-                / Path("paperless_mail")
+                / Path("paperless")
                / Path("tests")
                / Path("samples")
+                / Path("mail")
            ).resolve()
            / "html.eml",
            source=DocumentSource.MailFetch,
@@ -1,6 +1,26 @@
+"""
+Built-in mail document parser.
+
+Handles message/rfc822 (EML) MIME type by:
+- Parsing the email using imap_tools
+- Generating a PDF via Gotenberg (for display and archive)
+- Extracting text via Tika for HTML content
+- Extracting metadata from email headers
+
+The parser always produces a PDF because EML files cannot be rendered
+natively in a browser (requires_pdf_rendition=True).
+"""
+
+from __future__ import annotations
+
+import logging
 import re
+import shutil
+import tempfile
 from html import escape
 from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self

 from bleach import clean
 from bleach import linkify
@@ -19,65 +39,353 @@ from imap_tools import MailAttachment
 from imap_tools import MailMessage
 from tika_client import TikaClient

-from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.models import OutputTypeChoices
+from paperless.version import __full_version_str__
 from paperless_mail.models import MailRule

+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType

-class MailDocumentParser(DocumentParser):
-    """
-    This parser uses imap_tools to parse .eml files, generates pdf using
-    Gotenberg and sends the html part to a Tika server for text extraction.
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsing.mail")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "message/rfc822": ".eml",
+}
+
+
+class MailDocumentParser:
+    """Parse .eml email files for Paperless-ngx.
+
+    Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
+    and sends the HTML part to a Tika server for text extraction.  Because
+    EML files cannot be rendered natively in a browser, the parser always
+    produces a PDF rendition (requires_pdf_rendition=True).
+
+    The mailrule_id instance attribute may be set by the consumer before
+    calling parse() to apply mail-rule-specific PDF layout options:
+
+        parser.mailrule_id = rule.pk
+        parser.parse(path, mime_type)
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
    """

-    logging_name = "paperless.parsing.mail"
+    name: str = "Paperless-ngx Mail Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"

-    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
        """
-        Converts our requested PDF/A output into the Gotenberg API
-        format
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            20 if the MIME type is supported (higher than the default 10 to
+            give the mail parser clear priority), otherwise None.
        """
-        if settings.OCR_OUTPUT_TYPE in {
-            OutputTypeChoices.PDF_A,
-            OutputTypeChoices.PDF_A2,
-        }:
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
-            self.log.warning(
-                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
-            )
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
-            return PdfAFormat.A3b
+        if mime_type in _SUPPORTED_MIME_TYPES:
+            return 20
        return None

+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — the mail parser produces a display PDF
+            (requires_pdf_rendition=True), not an optional OCR archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always True — EML files cannot be rendered natively in a browser,
+            so a PDF conversion is always required for display.
+        """
+        return True
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
+        self._text: str | None = None
+        self._date: datetime.datetime | None = None
+        self._archive_path: Path | None = None
+        self.mailrule_id: int | None = None
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        logger.debug("Cleaning up temporary directory %s", self._tempdir)
+        shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Parse the given .eml into formatted text and a PDF archive.
+
+        The consumer may set ``self.mailrule_id`` before calling this method
+        to apply mail-rule-specific PDF layout options.  The ``produce_archive``
+        flag is accepted for protocol compatibility but has no effect —
+        the mail parser always produces a PDF since EML files cannot be
+        displayed natively.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the .eml file.
+        mime_type:
+            Detected MIME type of the document (should be "message/rfc822").
+        produce_archive:
+            Accepted for protocol compatibility. The PDF rendition is always
+            produced since EML files cannot be displayed natively in a browser.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be parsed or PDF generation fails.
+        """
+
+        def strip_text(text: str) -> str:
+            """Reduces the spacing of the given text string."""
+            text = re.sub(r"\s+", " ", text)
+            text = re.sub(r"(\n *)+", "\n", text)
+            return text.strip()
+
+        def build_formatted_text(mail_message: MailMessage) -> str:
+            """Constructs a formatted string based on the given email."""
+            fmt_text = f"Subject: {mail_message.subject}\n\n"
+            fmt_text += f"From: {mail_message.from_values.full}\n\n"
+            to_list = [address.full for address in mail_message.to_values]
+            fmt_text += f"To: {', '.join(to_list)}\n\n"
+            if mail_message.cc_values:
+                fmt_text += (
+                    f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
+                )
+            if mail_message.bcc_values:
+                fmt_text += (
+                    f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
+                )
+            if mail_message.attachments:
+                att = []
+                for a in mail.attachments:
+                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
+                    att.append(
+                        f"{a.filename} ({attachment_size})",
+                    )
+                fmt_text += f"Attachments: {', '.join(att)}\n\n"
+
+            if mail.html:
+                fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
+
+            fmt_text += f"\n\n{strip_text(mail.text)}"
+
+            return fmt_text
+
+        logger.debug("Parsing file %s into an email", document_path.name)
+        mail = self.parse_file_to_message(document_path)
+
+        logger.debug("Building formatted text from email")
+        self._text = build_formatted_text(mail)
+
+        if is_naive(mail.date):
+            self._date = make_aware(mail.date)
+        else:
+            self._date = mail.date
+
+        logger.debug("Creating a PDF from the email")
+        if self.mailrule_id:
+            rule = MailRule.objects.get(pk=self.mailrule_id)
+            self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
+        else:
+            self._archive_path = self.generate_pdf(mail)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not been called yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Date from the email headers, or None if not detected.
+        """
+        return self._date
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated archive PDF, or None.
+
+        Returns
+        -------
+        Path | None
+            Path to the PDF produced by Gotenberg, or None if parse has not
+            been called yet.
+        """
+        return self._archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
    def get_thumbnail(
        self,
        document_path: Path,
        mime_type: str,
-        file_name=None,
+        file_name: str | None = None,
    ) -> Path:
-        if not self.archive_path:
-            self.archive_path = self.generate_pdf(
+        """Generate a thumbnail from the PDF rendition of the email.
+
+        Converts the document to PDF first if not already done.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+        file_name:
+            Kept for backward compatibility; not used.
+
+        Returns
+        -------
+        Path
+            Path to the generated WebP thumbnail inside the temporary directory.
+        """
+        if not self._archive_path:
+            self._archive_path = self.generate_pdf(
                self.parse_file_to_message(document_path),
            )

        return make_thumbnail_from_pdf(
-            self.archive_path,
-            self.tempdir,
-            self.logging_group,
+            self._archive_path,
+            self._tempdir,
        )

-    def extract_metadata(self, document_path: Path, mime_type: str):
-        result = []
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.
+
+        Returns
+        -------
+        int | None
+            Always None — page count is not available for email files.
+        """
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract metadata from the email headers.
+
+        Returns email headers as metadata entries with prefix "header",
+        plus summary entries for attachments and date.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Sorted list of metadata entries, or ``[]`` on parse failure.
+        """
+        result: list[MetadataEntry] = []

        try:
            mail = self.parse_file_to_message(document_path)
        except ParseError as e:
-            self.log.warning(
-                f"Error while fetching document metadata for {document_path}: {e}",
+            logger.warning(
+                "Error while fetching document metadata for %s: %s",
+                document_path,
+                e,
            )
            return result

@@ -86,7 +394,7 @@ class MailDocumentParser(DocumentParser):
            try:
                value.encode("utf-8")
            except UnicodeEncodeError as e:  # pragma: no cover
-                self.log.debug(f"Skipping header {key}: {e}")
+                logger.debug("Skipping header %s: %s", key, e)
                continue

            result.append(
@@ -123,81 +431,44 @@ class MailDocumentParser(DocumentParser):
        result.sort(key=lambda item: (item["prefix"], item["key"]))
        return result

-    def parse(
-        self,
-        document_path: Path,
-        mime_type: str,
-        file_name=None,
-        mailrule_id: int | None = None,
-    ) -> None:
-        """
-        Parses the given .eml into formatted text, based on the decoded email.
+    # ------------------------------------------------------------------
+    # Email-specific methods
+    # ------------------------------------------------------------------

-        """
-
-        def strip_text(text: str):
-            """
-            Reduces the spacing of the given text string
-            """
-            text = re.sub(r"\s+", " ", text)
-            text = re.sub(r"(\n *)+", "\n", text)
-            return text.strip()
-
-        def build_formatted_text(mail_message: MailMessage) -> str:
-            """
-            Constructs a formatted string, based on the given email.  Basically tries
-            to get most of the email content, included front matter, into a nice string
-            """
-            fmt_text = f"Subject: {mail_message.subject}\n\n"
-            fmt_text += f"From: {mail_message.from_values.full}\n\n"
-            to_list = [address.full for address in mail_message.to_values]
-            fmt_text += f"To: {', '.join(to_list)}\n\n"
-            if mail_message.cc_values:
-                fmt_text += (
-                    f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
-                )
-            if mail_message.bcc_values:
-                fmt_text += (
-                    f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
-                )
-            if mail_message.attachments:
-                att = []
-                for a in mail.attachments:
-                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
-                    att.append(
-                        f"{a.filename} ({attachment_size})",
-                    )
-                fmt_text += f"Attachments: {', '.join(att)}\n\n"
-
-            if mail.html:
-                fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
-
-            fmt_text += f"\n\n{strip_text(mail.text)}"
-
-            return fmt_text
-
-        self.log.debug(f"Parsing file {document_path.name} into an email")
-        mail = self.parse_file_to_message(document_path)
-
-        self.log.debug("Building formatted text from email")
-        self.text = build_formatted_text(mail)
-
-        if is_naive(mail.date):
-            self.date = make_aware(mail.date)
-        else:
-            self.date = mail.date
-
-        self.log.debug("Creating a PDF from the email")
-        if mailrule_id:
-            rule = MailRule.objects.get(pk=mailrule_id)
-            self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
-        else:
-            self.archive_path = self.generate_pdf(mail)
+    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
+        """Convert the OCR output type setting to a Gotenberg PdfAFormat."""
+        if settings.OCR_OUTPUT_TYPE in {
+            OutputTypeChoices.PDF_A,
+            OutputTypeChoices.PDF_A2,
+        }:
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
+            logger.warning(
+                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+            )
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
+            return PdfAFormat.A3b
+        return None

    @staticmethod
    def parse_file_to_message(filepath: Path) -> MailMessage:
-        """
-        Parses the given .eml file into a MailMessage object
+        """Parse the given .eml file into a MailMessage object.
+
+        Parameters
+        ----------
+        filepath:
+            Path to the .eml file.
+
+        Returns
+        -------
+        MailMessage
+            Parsed mail message.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be parsed or is missing required fields.
        """
        try:
            with filepath.open("rb") as eml:
@@ -213,8 +484,25 @@ class MailDocumentParser(DocumentParser):

        return parsed

-    def tika_parse(self, html: str):
-        self.log.info("Sending content to Tika server")
+    def tika_parse(self, html: str) -> str:
+        """Send HTML content to the Tika server for text extraction.
+
+        Parameters
+        ----------
+        html:
+            HTML string to parse.
+
+        Returns
+        -------
+        str
+            Extracted plain text.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the Tika server cannot be reached or returns an error.
+        """
+        logger.info("Sending content to Tika server")

        try:
            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
@@ -234,16 +522,32 @@ class MailDocumentParser(DocumentParser):
        mail_message: MailMessage,
        pdf_layout: MailRule.PdfLayout | None = None,
    ) -> Path:
-        archive_path = Path(self.tempdir) / "merged.pdf"
+        """Generate a PDF from the email message.
+
+        Creates separate PDFs for the email body and HTML content, then
+        merges them according to the requested layout.
+
+        Parameters
+        ----------
+        mail_message:
+            Parsed email message.
+        pdf_layout:
+            Layout option for the PDF. Falls back to the
+            EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+        """
+        archive_path = Path(self._tempdir) / "merged.pdf"

        mail_pdf_file = self.generate_pdf_from_mail(mail_message)

-        pdf_layout = (
-            pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
-        )  # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
+        pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT

-        # If no HTML content, create the PDF from the message
-        # Otherwise, create 2 PDFs and merge them with Gotenberg
+        # If no HTML content, create the PDF from the message.
+        # Otherwise, create 2 PDFs and merge them with Gotenberg.
        if not mail_message.html:
            archive_path.write_bytes(mail_pdf_file.read_bytes())
        else:
@@ -252,7 +556,7 @@ class MailDocumentParser(DocumentParser):
                mail_message.attachments,
            )

-            self.log.debug("Merging email text and HTML content into single PDF")
+            logger.debug("Merging email text and HTML content into single PDF")

            with (
                GotenbergClient(
@@ -287,15 +591,21 @@ class MailDocumentParser(DocumentParser):
        return archive_path

    def mail_to_html(self, mail: MailMessage) -> Path:
-        """
-        Converts the given email into an HTML file, formatted
-        based on the given template
+        """Convert the given email into an HTML file using a template.
+
+        Parameters
+        ----------
+        mail:
+            Parsed mail message.
+
+        Returns
+        -------
+        Path
+            Path to the rendered HTML file inside the temporary directory.
        """

        def clean_html(text: str) -> str:
-            """
-            Attempts to clean, escape and linkify the given HTML string
-            """
+            """Attempt to clean, escape, and linkify the given HTML string."""
            if isinstance(text, list):
                text = "\n".join([str(e) for e in text])
            if not isinstance(text, str):
@@ -340,19 +650,37 @@ class MailDocumentParser(DocumentParser):

        from django.template.loader import render_to_string

-        html_file = Path(self.tempdir) / "email_as_html.html"
+        html_file = Path(self._tempdir) / "email_as_html.html"
        html_file.write_text(render_to_string("email_msg_template.html", context=data))

        return html_file

    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
-        """
-        Creates a PDF based on the given email, using the email's values in a
-        an HTML template
-        """
-        self.log.info("Converting mail to PDF")
+        """Create a PDF from the email body using an HTML template and Gotenberg.

-        css_file = Path(__file__).parent / "templates" / "output.css"
+        Parameters
+        ----------
+        mail:
+            Parsed mail message.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Gotenberg returns an error.
+        """
+        logger.info("Converting mail to PDF")
+
+        css_file = (
+            Path(__file__).parent.parent.parent
+            / "paperless_mail"
+            / "templates"
+            / "output.css"
+        )
        email_html_file = self.mail_to_html(mail)

        with (
@@ -388,7 +716,7 @@ class MailDocumentParser(DocumentParser):
                    f"Error while converting email to PDF: {err}",
                ) from err

-        email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
+        email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
        email_as_pdf_file.write_bytes(response.content)

        return email_as_pdf_file
@@ -398,11 +726,27 @@ class MailDocumentParser(DocumentParser):
        orig_html: str,
        attachments: list[MailAttachment],
    ) -> Path:
-        """
-        Generates a PDF file based on the HTML and attachments of the email
+        """Generate a PDF from the HTML content of the email.
+
+        Parameters
+        ----------
+        orig_html:
+            Raw HTML string from the email body.
+        attachments:
+            List of email attachments (used as inline resources).
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Gotenberg returns an error.
        """

-        def clean_html_script(text: str):
+        def clean_html_script(text: str) -> str:
            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
            text = compiled_open.sub("<div hidden ", text)

@@ -410,9 +754,9 @@ class MailDocumentParser(DocumentParser):
            text = compiled_close.sub("</div", text)
            return text

-        self.log.info("Converting message html to PDF")
+        logger.info("Converting message html to PDF")

-        tempdir = Path(self.tempdir)
+        tempdir = Path(self._tempdir)

        html_clean = clean_html_script(orig_html)
        html_clean_file = tempdir / "index.html"
@@ -473,9 +817,3 @@ class MailDocumentParser(DocumentParser):
        html_pdf = tempdir / "html.pdf"
        html_pdf.write_bytes(response.content)
        return html_pdf
-
-    def get_settings(self) -> None:
-        """
-        This parser does not implement additional settings yet
-        """
-        return None
@@ -193,11 +193,13 @@ class ParserRegistry:
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
+        from paperless.parsers.mail import MailDocumentParser
        from paperless.parsers.text import TextDocumentParser
        from paperless.parsers.tika import TikaDocumentParser

        self.register_builtin(TextDocumentParser)
        self.register_builtin(TikaDocumentParser)
+        self.register_builtin(MailDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
@@ -24,7 +24,7 @@ class TestEmailFileParsing:
    def test_parse_error_missing_file(
        self,
        mail_parser: MailDocumentParser,
-        sample_dir: Path,
+        mail_samples_dir: Path,
    ) -> None:
        """
        GIVEN:
@@ -35,7 +35,7 @@ class TestEmailFileParsing:
            - An Exception is thrown
        """
        # Check if exception is raised when parsing fails.
-        test_file = sample_dir / "doesntexist.eml"
+        test_file = mail_samples_dir / "doesntexist.eml"

        assert not test_file.exists()

@@ -246,12 +246,12 @@ class TestEmailThumbnailGenerate:
        """
        mocked_return = "Passing the return value through.."
        mock_make_thumbnail_from_pdf = mocker.patch(
-            "paperless_mail.parsers.make_thumbnail_from_pdf",
+            "paperless.parsers.mail.make_thumbnail_from_pdf",
        )
        mock_make_thumbnail_from_pdf.return_value = mocked_return

        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = "Mocked return value.."

@@ -260,8 +260,7 @@ class TestEmailThumbnailGenerate:
        mock_generate_pdf.assert_called_once()
        mock_make_thumbnail_from_pdf.assert_called_once_with(
            "Mocked return value..",
-            mail_parser.tempdir,
-            None,
+            mail_parser._tempdir,
        )

        assert mocked_return == thumb
@@ -373,7 +372,7 @@ class TestParser:
        """
        # Validate parsing returns the expected results
        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )

        mail_parser.parse(simple_txt_email_file, "message/rfc822")
@@ -385,7 +384,7 @@ class TestParser:
            "BCC: fdf@fvf.de\n\n"
            "\n\nThis is just a simple Text Mail."
        )
-        assert text_expected == mail_parser.text
+        assert text_expected == mail_parser.get_text()
        assert (
            datetime.datetime(
                2022,
@@ -396,7 +395,7 @@ class TestParser:
                43,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.date
+            == mail_parser.get_date()
        )

        # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
@@ -419,7 +418,7 @@ class TestParser:
        """

        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )

        # Validate parsing returns the expected results
@@ -443,7 +442,7 @@ class TestParser:
        mail_parser.parse(html_email_file, "message/rfc822")

        mock_generate_pdf.assert_called_once()
-        assert text_expected == mail_parser.text
+        assert text_expected == mail_parser.get_text()
        assert (
            datetime.datetime(
                2022,
@@ -454,7 +453,7 @@ class TestParser:
                19,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.date
+            == mail_parser.get_date()
        )

    def test_generate_pdf_parse_error(
@@ -501,7 +500,7 @@ class TestParser:

        mail_parser.parse(simple_txt_email_file, "message/rfc822")

-        assert mail_parser.archive_path is not None
+        assert mail_parser.get_archive_path() is not None

    @pytest.mark.httpx_mock(can_send_already_matched_responses=True)
    def test_generate_pdf_html_email(
@@ -542,7 +541,7 @@ class TestParser:
        )
        mail_parser.parse(html_email_file, "message/rfc822")

-        assert mail_parser.archive_path is not None
+        assert mail_parser.get_archive_path() is not None

    def test_generate_pdf_html_email_html_to_pdf_failure(
        self,
@@ -712,10 +711,10 @@ class TestParser:

        def test_layout_option(layout_option, expected_calls, expected_pdf_names):
            mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
+            mail_parser.mailrule_id = 1
            mail_parser.parse(
                document_path=html_email_file,
                mime_type="message/rfc822",
-                mailrule_id=1,
            )
            args, _ = mock_merge_route.call_args
            assert len(args[0]) == expected_calls
@@ -159,7 +159,7 @@ class TestParserLive:
            - The returned thumbnail image file shall match the expected hash
        """
        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = simple_txt_email_pdf_file

@@ -216,10 +216,10 @@ class TestParserLive:
            - The merged PDF shall contain text from both source PDFs
        """
        mock_generate_pdf_from_html = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
        )
        mock_generate_pdf_from_mail = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
        )
        mock_generate_pdf_from_mail.return_value = merged_pdf_first
        mock_generate_pdf_from_html.return_value = merged_pdf_second
@@ -1,5 +1,5 @@
 def get_parser(*args, **kwargs):
-    from paperless_mail.parsers import MailDocumentParser
+    from paperless.parsers.mail import MailDocumentParser

    return MailDocumentParser(*args, **kwargs)

@@ -3,14 +3,20 @@ from pathlib import Path

 import pytest

+from paperless.parsers.mail import MailDocumentParser
 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.models import MailAccount
-from paperless_mail.parsers import MailDocumentParser


@pytest.fixture(scope="session")
 def sample_dir() -> Path:
-    return (Path(__file__).parent / Path("samples")).resolve()
+    return (
+        Path(__file__).parent.parent.parent
+        / Path("paperless")
+        / Path("tests")
+        / Path("samples")
+        / Path("mail")
+    ).resolve()


@pytest.fixture(scope="session")
@@ -64,8 +70,9 @@ def merged_pdf_second(sample_dir: Path) -> Path:


@pytest.fixture()
-def mail_parser() -> MailDocumentParser:
-    return MailDocumentParser(logging_group=None)
+def mail_parser() -> Generator[MailDocumentParser, None, None]:
+    with MailDocumentParser() as parser:
+        yield parser


@pytest.fixture()