Adds a docstring that an IDE will render better

Handles the rename of the migration
Fixes logging so I can see it
2026-03-18 23:15:57 +00:00 · 2026-03-18 15:26:10 -07:00 · 2026-03-18 15:23:37 -07:00 · 2026-03-18 15:22:18 -07:00 · 2026-03-18 15:22:18 -07:00 · 2026-03-18 15:22:17 -07:00
35 changed files with 454 additions and 737 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -256,7 +256,7 @@ lint.isort.force-single-line = true
 [tool.codespell]
 write-changes = true
 ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
-skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
+skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"

 [tool.pytest]
 minversion = "9.0"
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,4 @@
 import datetime
-import hashlib
 import os
 import tempfile
 from enum import StrEnum
@@ -48,12 +47,13 @@ from documents.signals import document_consumption_started
 from documents.signals import document_updated
 from documents.signals.handlers import run_workflows
 from documents.templating.workflows import parse_w_workflow_placeholders
+from documents.utils import compute_checksum
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
-from paperless.parsers.mail import MailDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
+from paperless_mail.parsers import MailDocumentParser

 LOGGING_NAME: Final[str] = "paperless.consumer"

@@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:

    TODO(stumpylog): Remove me in the future
    """
-    if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)):
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
        parser.__exit__(None, None, None)
    else:
        parser.cleanup()
@@ -214,9 +214,7 @@ class ConsumerPlugin(
        version_doc = Document(
            root_document=root_doc_frozen,
            version_index=next_version_index + 1,
-            checksum=hashlib.md5(
-                file_for_checksum.read_bytes(),
-            ).hexdigest(),
+            checksum=compute_checksum(file_for_checksum),
            content=text or "",
            page_count=page_count,
            mime_type=mime_type,
@@ -477,12 +475,14 @@ class ConsumerPlugin(
                isinstance(document_parser, MailDocumentParser)
                and self.input_doc.mailrule_id
            ):
-                document_parser.mailrule_id = self.input_doc.mailrule_id
-            if isinstance(
-                document_parser,
-                (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
-            ):
-                # TODO(stumpylog): Remove me in the future when all parsers use new protocol
+                document_parser.parse(
+                    self.working_copy,
+                    mime_type,
+                    self.filename,
+                    self.input_doc.mailrule_id,
+                )
+            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+                # TODO(stumpylog): Remove me in the future
                document_parser.parse(self.working_copy, mime_type)
            else:
                document_parser.parse(self.working_copy, mime_type, self.filename)
@@ -494,11 +494,8 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            if isinstance(
-                document_parser,
-                (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
-            ):
-                # TODO(stumpylog): Remove me in the future when all parsers use new protocol
+            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+                # TODO(stumpylog): Remove me in the future
                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
            else:
                thumbnail = document_parser.get_thumbnail(
@@ -688,10 +685,9 @@ class ConsumerPlugin(
                            document.archive_path,
                        )

-                        with Path(archive_path).open("rb") as f:
-                            document.archive_checksum = hashlib.md5(
-                                f.read(),
-                            ).hexdigest()
+                        document.archive_checksum = compute_checksum(
+                            Path(archive_path),
+                        )

                # Don't save with the lock active. Saving will cause the file
                # renaming logic to acquire the lock as well.
@@ -832,7 +828,7 @@ class ConsumerPlugin(
            title=title[:127],
            content=text,
            mime_type=mime_type,
-            checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
+            checksum=compute_checksum(file_for_checksum),
            created=create_date,
            modified=create_date,
            page_count=page_count,
@@ -949,10 +945,9 @@ class ConsumerPreflightPlugin(

    def pre_check_duplicate(self) -> None:
        """
-        Using the MD5 of the file, check this exact file doesn't already exist
+        Using the SHA256 of the file, check this exact file doesn't already exist
        """
-        with Path(self.input_doc.original_file).open("rb") as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
+        checksum = compute_checksum(Path(self.input_doc.original_file))
        existing_doc = Document.global_objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger
 from documents.settings import EXPORTER_ARCHIVE_NAME
 from documents.settings import EXPORTER_FILE_NAME
 from documents.settings import EXPORTER_THUMBNAIL_NAME
+from documents.utils import compute_checksum
 from documents.utils import copy_file_with_basic_stats
 from paperless import version
 from paperless.models import ApplicationConfiguration
@@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand):
            source_stat = source.stat()
            target_stat = target.stat()
            if self.compare_checksums and source_checksum:
-                target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
+                target_checksum = compute_checksum(target)
                perform_copy = target_checksum != source_checksum
            elif (
                source_stat.st_mtime != target_stat.st_mtime
--- a/src/documents/migrations/0016_sha256_checksums.py
+++ b/src/documents/migrations/0016_sha256_checksums.py
@@ -0,0 +1,130 @@
+import hashlib
+import logging
+from pathlib import Path
+
+from django.conf import settings
+from django.db import migrations
+from django.db import models
+
+logger = logging.getLogger("paperless.migrations")
+
+_CHUNK_SIZE = 65536  # 64 KiB — avoids loading entire files into memory
+_BATCH_SIZE = 500  # documents per bulk_update call
+_PROGRESS_INTERVAL = 500  # log a progress line every N documents
+
+
+def _sha256(path: Path) -> str:  # hex SHA-256 digest of the file at ``path``
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        for block in iter(lambda: stream.read(_CHUNK_SIZE), b""):  # b"" == EOF
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def recompute_checksums(apps, schema_editor):
+    """Recompute every document's stored checksums as SHA-256 digests of its files."""
+    Document = apps.get_model("documents", "Document")  # historical model state
+
+    total = Document.objects.count()
+    if total == 0:
+        return
+
+    logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
+
+    batch: list = []
+    processed = 0
+
+    for doc in Document.objects.only(
+        "pk",
+        "filename",
+        "checksum",
+        "archive_filename",
+        "archive_checksum",
+    ).iterator(chunk_size=_BATCH_SIZE):
+        updated_fields: list[str] = []
+
+        # Reconstruct source path the same way Document.source_path does
+        fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
+        source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
+
+        if source_path.exists():
+            doc.checksum = _sha256(source_path)
+            updated_fields.append("checksum")
+        else:
+            logger.warning(
+                "Document %s: original file %s not found, checksum not updated.",
+                doc.pk,
+                source_path,
+            )
+
+        # Mirror Document.has_archive_version: archive_filename is not None
+        if doc.archive_filename is not None:
+            archive_path = (
+                settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
+            ).resolve()
+            if archive_path.exists():
+                doc.archive_checksum = _sha256(archive_path)
+                updated_fields.append("archive_checksum")
+            else:
+                logger.warning(
+                    "Document %s: archive file %s not found, checksum not updated.",
+                    doc.pk,
+                    archive_path,
+                )
+
+        if updated_fields:  # only queue documents whose checksum(s) changed
+            batch.append(doc)
+
+        processed += 1
+
+        if len(batch) >= _BATCH_SIZE:  # flush a full batch in one UPDATE round
+            Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
+            batch.clear()
+
+        if processed % _PROGRESS_INTERVAL == 0:
+            logger.info(
+                "SHA-256 checksum progress: %d/%d (%d%%)",
+                processed,
+                total,
+                processed * 100 // total,
+            )
+
+    if batch:  # flush whatever is left after the final partial batch
+        Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
+
+    logger.info(
+        "SHA-256 checksum recomputation complete: %d document(s) processed.",
+        processed,
+    )
+
+
+class Migration(migrations.Migration):  # widen checksum columns, then rehash files
+    dependencies = [
+        ("documents", "0015_document_version_index_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="document",
+            name="checksum",
+            field=models.CharField(
+                editable=False,
+                help_text="The checksum of the original document.",
+                max_length=64,  # length of a SHA-256 hex digest
+                verbose_name="checksum",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="document",
+            name="archive_checksum",
+            field=models.CharField(
+                blank=True,
+                editable=False,
+                help_text="The checksum of the archived document.",
+                max_length=64,  # length of a SHA-256 hex digest
+                null=True,
+                verbose_name="archive checksum",
+            ),
+        ),
+        migrations.RunPython(recompute_checksums, migrations.RunPython.noop),  # no-op on reverse
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner):  # type: ignore[django-manager-

    checksum = models.CharField(
        _("checksum"),
-        max_length=32,
+        max_length=64,
        editable=False,
        help_text=_("The checksum of the original document."),
    )

    archive_checksum = models.CharField(
        _("archive checksum"),
-        max_length=32,
+        max_length=64,
        editable=False,
        blank=True,
        null=True,
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@@ -11,7 +11,6 @@ is an identity function that adds no overhead.

 from __future__ import annotations

-import hashlib
 import logging
 import uuid
 from collections import defaultdict
@@ -30,6 +29,7 @@ from django.utils import timezone

 from documents.models import Document
 from documents.models import PaperlessTask
+from documents.utils import compute_checksum
 from paperless.config import GeneralConfig

 logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(

    present_files.discard(source_path)
    try:
-        checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
+        checksum = compute_checksum(source_path)
    except OSError as e:
        messages.error(doc.pk, f"Cannot read original file of document: {e}")
    else:
@@ -255,7 +255,7 @@ def _check_archive(

        present_files.discard(archive_path)
        try:
-            checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
+            checksum = compute_checksum(archive_path)
        except OSError as e:
            messages.error(
                doc.pk,
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -1,5 +1,4 @@
 import datetime
-import hashlib
 import logging
 import shutil
 import uuid
@@ -63,6 +62,7 @@ from documents.signals import document_updated
 from documents.signals.handlers import cleanup_document_deletion
 from documents.signals.handlers import run_workflows
 from documents.signals.handlers import send_websocket_document_updated
+from documents.utils import compute_checksum
 from documents.workflows.utils import get_workflows_for_trigger
 from paperless.config import AIConfig
 from paperless_ai.indexing import llm_index_add_or_update_document
@@ -327,8 +327,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
        with transaction.atomic():
            oldDocument = Document.objects.get(pk=document.pk)
            if parser.get_archive_path():
-                with Path(parser.get_archive_path()).open("rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
+                checksum = compute_checksum(Path(parser.get_archive_path()))
                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # We also don't use save() since that triggers the filehandling
--- a/src/documents/tests/conftest.py
+++ b/src/documents/tests/conftest.py
@@ -82,8 +82,8 @@ def sample_doc(

    return DocumentFactory(
        title="test",
-        checksum="42995833e01aea9b3edee44bbfdd7ce1",
-        archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+        checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+        archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
        content="test content",
        pk=1,
        filename="0000001.pdf",
--- a/src/documents/tests/factories.py
+++ b/src/documents/tests/factories.py
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
        model = Document

    title = factory.Faker("sentence", nb_words=4)
-    checksum = factory.Faker("md5")
+    checksum = factory.Faker("sha256")
    content = factory.Faker("paragraph")
    correspondent = None
    document_type = None
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -36,6 +36,7 @@ from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 from documents.tests.utils import GetConsumerMixin
 from paperless_mail.models import MailRule
+from paperless_mail.parsers import MailDocumentParser


 class _BaseTestParser(DocumentParser):
@@ -244,8 +245,14 @@ class TestConsumer(

        self.assertIsFile(document.archive_path)

-        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
-        self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
+        self.assertEqual(
+            document.checksum,
+            "1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+        )
+        self.assertEqual(
+            document.archive_checksum,
+            "706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
+        )

        self.assertIsNotFile(filename)

@@ -1090,7 +1097,7 @@ class TestConsumer(
            self.assertEqual(command[1], "--replace-input")

    @mock.patch("paperless_mail.models.MailRule.objects.get")
-    @mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
+    @mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_mail_parser_receives_mailrule(
        self,
@@ -1106,13 +1113,11 @@ class TestConsumer(
        THEN:
            - The mail parser should receive the mail rule
        """
-        from paperless_mail.signals import get_parser as mail_get_parser
-
        mock_consumer_declaration_send.return_value = [
            (
                None,
                {
-                    "parser": mail_get_parser,
+                    "parser": MailDocumentParser,
                    "mime_types": {"message/rfc822": ".eml"},
                    "weight": 0,
                },
@@ -1124,10 +1129,9 @@ class TestConsumer(
        with self.get_consumer(
            filepath=(
                Path(__file__).parent.parent.parent
-                / Path("paperless")
+                / Path("paperless_mail")
                / Path("tests")
                / Path("samples")
-                / Path("mail")
            ).resolve()
            / "html.eml",
            source=DocumentSource.MailFetch,
@@ -1138,10 +1142,12 @@ class TestConsumer(
                ConsumerError,
            ):
                consumer.run()
-            mock_mail_parser_parse.assert_called_once_with(
-                consumer.working_copy,
-                "message/rfc822",
-            )
+                mock_mail_parser_parse.assert_called_once_with(
+                    consumer.working_copy,
+                    "message/rfc822",
+                    file_name="sample.pdf",
+                    mailrule=mock_mailrule_get.return_value,
+                )


@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@@ -63,8 +63,8 @@ class TestExportImport(

        self.d1 = Document.objects.create(
            content="Content",
-            checksum="42995833e01aea9b3edee44bbfdd7ce1",
-            archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+            checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+            archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
            title="wow1",
            filename="0000001.pdf",
            mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
        )
        self.d2 = Document.objects.create(
            content="Content",
-            checksum="9c9691e51741c1f4f41a20896af31770",
+            checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
            title="wow2",
            filename="0000002.pdf",
            mime_type="application/pdf",
        )
        self.d3 = Document.objects.create(
            content="Content",
-            checksum="d38d7ed02e988e072caf924e0f3fcb76",
+            checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
            title="wow2",
            filename="0000003.pdf",
            mime_type="application/pdf",
        )
        self.d4 = Document.objects.create(
            content="Content",
-            checksum="82186aaa94f0b98697d704b90fd1c072",
+            checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
            title="wow_dec",
            filename="0000004.pdf",
            mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
                )

                with Path(fname).open("rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
+                    checksum = hashlib.sha256(f.read()).hexdigest()
                self.assertEqual(checksum, element["fields"]["checksum"])

                # Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
                    self.assertIsFile(fname)

                    with Path(fname).open("rb") as f:
-                        checksum = hashlib.md5(f.read()).hexdigest()
+                        checksum = hashlib.sha256(f.read()).hexdigest()
                    self.assertEqual(checksum, element["fields"]["archive_checksum"])

            elif element["model"] == "documents.note":
--- a/src/documents/tests/test_management_importer.py
+++ b/src/documents/tests/test_management_importer.py
@@ -277,8 +277,8 @@ class TestCommandImport(

        Document.objects.create(
            content="Content",
-            checksum="42995833e01aea9b3edee44bbfdd7ce1",
-            archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+            checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+            archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
            title="wow1",
            filename="0000001.pdf",
            mime_type="application/pdf",
--- a/src/documents/utils.py
+++ b/src/documents/utils.py
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import shutil
 from os import utime
@@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool:
    Return a boolean value from a string representation.
    """
    return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
+
+
+def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
+    """
+    Return the SHA-256 hex digest of the file located at ``path``.
+
+    Streams the file in blocks so memory use stays bounded for large files.
+
+    Args:
+        path (Path): Location of the file whose contents are hashed.
+        chunk_size (int, optional): Bytes requested from the file per read.
+            Defaults to 65536.
+
+    Returns:
+        str: The digest as a lowercase hexadecimal string.
+
+    Raises:
+        FileNotFoundError: If no file exists at ``path``.
+        OSError: If the file cannot be opened or read.
+    """
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        for block in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(block)
+    return digest.hexdigest()
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -193,13 +193,11 @@ class ParserRegistry:
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
-        from paperless.parsers.mail import MailDocumentParser
        from paperless.parsers.text import TextDocumentParser
        from paperless.parsers.tika import TikaDocumentParser

        self.register_builtin(TextDocumentParser)
        self.register_builtin(TikaDocumentParser)
-        self.register_builtin(MailDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING

 import pytest

-from paperless.parsers.mail import MailDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser

@@ -159,166 +158,3 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    with TikaDocumentParser() as parser:
        yield parser
-
-
-# ------------------------------------------------------------------
-# Mail parser sample files
-# ------------------------------------------------------------------
-
-
-@pytest.fixture(scope="session")
-def mail_samples_dir(samples_dir: Path) -> Path:
-    """Absolute path to the mail parser sample files directory.
-
-    Returns
-    -------
-    Path
-        ``<samples_dir>/mail/``
-    """
-    return samples_dir / "mail"
-
-
-@pytest.fixture(scope="session")
-def broken_email_file(mail_samples_dir: Path) -> Path:
-    """Path to a broken/malformed EML sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/broken.eml``.
-    """
-    return mail_samples_dir / "broken.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_file(mail_samples_dir: Path) -> Path:
-    """Path to a plain-text email sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/simple_text.eml``.
-    """
-    return mail_samples_dir / "simple_text.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected PDF rendition of the plain-text email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/simple_text.eml.pdf``.
-    """
-    return mail_samples_dir / "simple_text.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected thumbnail for the plain-text email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/simple_text.eml.pdf.webp``.
-    """
-    return mail_samples_dir / "simple_text.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_file(mail_samples_dir: Path) -> Path:
-    """Path to an HTML email sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml``.
-    """
-    return mail_samples_dir / "html.eml"
-
-
-@pytest.fixture(scope="session")
-def html_email_pdf_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected PDF rendition of the HTML email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml.pdf``.
-    """
-    return mail_samples_dir / "html.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected thumbnail for the HTML email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml.pdf.webp``.
-    """
-    return mail_samples_dir / "html.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_html_file(mail_samples_dir: Path) -> Path:
-    """Path to the HTML body of the HTML email sample.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml.html``.
-    """
-    return mail_samples_dir / "html.eml.html"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_first(mail_samples_dir: Path) -> Path:
-    """Path to the first PDF used in PDF-merge tests.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/first.pdf``.
-    """
-    return mail_samples_dir / "first.pdf"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_second(mail_samples_dir: Path) -> Path:
-    """Path to the second PDF used in PDF-merge tests.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/second.pdf``.
-    """
-    return mail_samples_dir / "second.pdf"
-
-
-# ------------------------------------------------------------------
-# Mail parser instance
-# ------------------------------------------------------------------
-
-
-@pytest.fixture()
-def mail_parser() -> Generator[MailDocumentParser, None, None]:
-    """Yield a MailDocumentParser and clean up its temporary directory afterwards.
-
-    Yields
-    ------
-    MailDocumentParser
-        A ready-to-use parser instance.
-    """
-    with MailDocumentParser() as parser:
-        yield parser
-
-
-@pytest.fixture(scope="session")
-def nginx_base_url() -> Generator[str, None, None]:
-    """
-    The base URL for the nginx HTTP server we expect to be alive
-    """
-    yield "http://localhost:8080"
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -1,26 +1,6 @@
-"""
-Built-in mail document parser.
-
-Handles message/rfc822 (EML) MIME type by:
- Parsing the email using imap_tools
- Generating a PDF via Gotenberg (for display and archive)
- Extracting text via Tika for HTML content
- Extracting metadata from email headers
-
-The parser always produces a PDF because EML files cannot be rendered
-natively in a browser (requires_pdf_rendition=True).
-"""
-
-from __future__ import annotations
-
-import logging
 import re
-import shutil
-import tempfile
 from html import escape
 from pathlib import Path
-from typing import TYPE_CHECKING
-from typing import Self

 from bleach import clean
 from bleach import linkify
@@ -39,353 +19,65 @@ from imap_tools import MailAttachment
 from imap_tools import MailMessage
 from tika_client import TikaClient

+from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.models import OutputTypeChoices
-from paperless.version import __full_version_str__
 from paperless_mail.models import MailRule

-if TYPE_CHECKING:
-    import datetime
-    from types import TracebackType

-    from paperless.parsers import MetadataEntry
-
-logger = logging.getLogger("paperless.parsing.mail")
-
-_SUPPORTED_MIME_TYPES: dict[str, str] = {
-    "message/rfc822": ".eml",
-}
-
-
-class MailDocumentParser:
-    """Parse .eml email files for Paperless-ngx.
-
-    Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
-    and sends the HTML part to a Tika server for text extraction.  Because
-    EML files cannot be rendered natively in a browser, the parser always
-    produces a PDF rendition (requires_pdf_rendition=True).
-
-    The mailrule_id instance attribute may be set by the consumer before
-    calling parse() to apply mail-rule-specific PDF layout options:
-
-        parser.mailrule_id = rule.pk
-        parser.parse(path, mime_type)
-
-    Class attributes
-    ----------------
-    name : str
-        Human-readable parser name.
-    version : str
-        Semantic version string, kept in sync with Paperless-ngx releases.
-    author : str
-        Maintainer name.
-    url : str
-        Issue tracker / source URL.
+class MailDocumentParser(DocumentParser):
+    """
+    This parser uses imap_tools to parse .eml files, generates pdf using
+    Gotenberg and sends the html part to a Tika server for text extraction.
    """

-    name: str = "Paperless-ngx Mail Parser"
-    version: str = __full_version_str__
-    author: str = "Paperless-ngx Contributors"
-    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+    logging_name = "paperless.parsing.mail"

-    # ------------------------------------------------------------------
-    # Class methods
-    # ------------------------------------------------------------------
-
-    @classmethod
-    def supported_mime_types(cls) -> dict[str, str]:
-        """Return the MIME types this parser handles.
-
-        Returns
-        -------
-        dict[str, str]
-            Mapping of MIME type to preferred file extension.
+    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
        """
-        return _SUPPORTED_MIME_TYPES
-
-    @classmethod
-    def score(
-        cls,
-        mime_type: str,
-        filename: str,
-        path: Path | None = None,
-    ) -> int | None:
-        """Return the priority score for handling this file.
-
-        Parameters
-        ----------
-        mime_type:
-            Detected MIME type of the file.
-        filename:
-            Original filename including extension.
-        path:
-            Optional filesystem path. Not inspected by this parser.
-
-        Returns
-        -------
-        int | None
-            20 if the MIME type is supported (higher than the default 10 to
-            give the mail parser clear priority), otherwise None.
+        Converts our requested PDF/A output into the Gotenberg API
+        format
        """
-        if mime_type in _SUPPORTED_MIME_TYPES:
-            return 20
+        if settings.OCR_OUTPUT_TYPE in {
+            OutputTypeChoices.PDF_A,
+            OutputTypeChoices.PDF_A2,
+        }:
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
+            self.log.warning(
+                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+            )
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
+            return PdfAFormat.A3b
        return None

-    # ------------------------------------------------------------------
-    # Properties
-    # ------------------------------------------------------------------
-
-    @property
-    def can_produce_archive(self) -> bool:
-        """Whether this parser can produce a searchable PDF archive copy.
-
-        Returns
-        -------
-        bool
-            Always False — the mail parser produces a display PDF
-            (requires_pdf_rendition=True), not an optional OCR archive.
-        """
-        return False
-
-    @property
-    def requires_pdf_rendition(self) -> bool:
-        """Whether the parser must produce a PDF for the frontend to display.
-
-        Returns
-        -------
-        bool
-            Always True — EML files cannot be rendered natively in a browser,
-            so a PDF conversion is always required for display.
-        """
-        return True
-
-    # ------------------------------------------------------------------
-    # Lifecycle
-    # ------------------------------------------------------------------
-
-    def __init__(self, logging_group: object = None) -> None:
-        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
-        self._tempdir = Path(
-            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
-        )
-        self._text: str | None = None
-        self._date: datetime.datetime | None = None
-        self._archive_path: Path | None = None
-        self.mailrule_id: int | None = None
-
-    def __enter__(self) -> Self:
-        return self
-
-    def __exit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: TracebackType | None,
-    ) -> None:
-        logger.debug("Cleaning up temporary directory %s", self._tempdir)
-        shutil.rmtree(self._tempdir, ignore_errors=True)
-
-    # ------------------------------------------------------------------
-    # Core parsing interface
-    # ------------------------------------------------------------------
-
-    def parse(
-        self,
-        document_path: Path,
-        mime_type: str,
-        *,
-        produce_archive: bool = True,
-    ) -> None:
-        """Parse the given .eml into formatted text and a PDF archive.
-
-        The consumer may set ``self.mailrule_id`` before calling this method
-        to apply mail-rule-specific PDF layout options.  The ``produce_archive``
-        flag is accepted for protocol compatibility but is always honoured —
-        the mail parser always produces a PDF since EML files cannot be
-        displayed natively.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the .eml file.
-        mime_type:
-            Detected MIME type of the document (should be "message/rfc822").
-        produce_archive:
-            Accepted for protocol compatibility. The PDF rendition is always
-            produced since EML files cannot be displayed natively in a browser.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If the file cannot be parsed or PDF generation fails.
-        """
-
-        def strip_text(text: str) -> str:
-            """Reduces the spacing of the given text string."""
-            text = re.sub(r"\s+", " ", text)
-            text = re.sub(r"(\n *)+", "\n", text)
-            return text.strip()
-
-        def build_formatted_text(mail_message: MailMessage) -> str:
-            """Constructs a formatted string based on the given email."""
-            fmt_text = f"Subject: {mail_message.subject}\n\n"
-            fmt_text += f"From: {mail_message.from_values.full}\n\n"
-            to_list = [address.full for address in mail_message.to_values]
-            fmt_text += f"To: {', '.join(to_list)}\n\n"
-            if mail_message.cc_values:
-                fmt_text += (
-                    f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
-                )
-            if mail_message.bcc_values:
-                fmt_text += (
-                    f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
-                )
-            if mail_message.attachments:
-                att = []
-                for a in mail.attachments:
-                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
-                    att.append(
-                        f"{a.filename} ({attachment_size})",
-                    )
-                fmt_text += f"Attachments: {', '.join(att)}\n\n"
-
-            if mail.html:
-                fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
-
-            fmt_text += f"\n\n{strip_text(mail.text)}"
-
-            return fmt_text
-
-        logger.debug("Parsing file %s into an email", document_path.name)
-        mail = self.parse_file_to_message(document_path)
-
-        logger.debug("Building formatted text from email")
-        self._text = build_formatted_text(mail)
-
-        if is_naive(mail.date):
-            self._date = make_aware(mail.date)
-        else:
-            self._date = mail.date
-
-        logger.debug("Creating a PDF from the email")
-        if self.mailrule_id:
-            rule = MailRule.objects.get(pk=self.mailrule_id)
-            self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
-        else:
-            self._archive_path = self.generate_pdf(mail)
-
-    # ------------------------------------------------------------------
-    # Result accessors
-    # ------------------------------------------------------------------
-
-    def get_text(self) -> str | None:
-        """Return the plain-text content extracted during parse.
-
-        Returns
-        -------
-        str | None
-            Extracted text, or None if parse has not been called yet.
-        """
-        return self._text
-
-    def get_date(self) -> datetime.datetime | None:
-        """Return the document date detected during parse.
-
-        Returns
-        -------
-        datetime.datetime | None
-            Date from the email headers, or None if not detected.
-        """
-        return self._date
-
-    def get_archive_path(self) -> Path | None:
-        """Return the path to the generated archive PDF, or None.
-
-        Returns
-        -------
-        Path | None
-            Path to the PDF produced by Gotenberg, or None if parse has not
-            been called yet.
-        """
-        return self._archive_path
-
-    # ------------------------------------------------------------------
-    # Thumbnail and metadata
-    # ------------------------------------------------------------------
-
    def get_thumbnail(
        self,
        document_path: Path,
        mime_type: str,
-        file_name: str | None = None,
+        file_name=None,
    ) -> Path:
-        """Generate a thumbnail from the PDF rendition of the email.
-
-        Converts the document to PDF first if not already done.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the source document.
-        mime_type:
-            Detected MIME type of the document.
-        file_name:
-            Kept for backward compatibility; not used.
-
-        Returns
-        -------
-        Path
-            Path to the generated WebP thumbnail inside the temporary directory.
-        """
-        if not self._archive_path:
-            self._archive_path = self.generate_pdf(
+        if not self.archive_path:
+            self.archive_path = self.generate_pdf(
                self.parse_file_to_message(document_path),
            )

        return make_thumbnail_from_pdf(
-            self._archive_path,
-            self._tempdir,
+            self.archive_path,
+            self.tempdir,
+            self.logging_group,
        )

-    def get_page_count(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> int | None:
-        """Return the number of pages in the document.
-
-        Returns
-        -------
-        int | None
-            Always None — page count is not available for email files.
-        """
-        return None
-
-    def extract_metadata(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> list[MetadataEntry]:
-        """Extract metadata from the email headers.
-
-        Returns email headers as metadata entries with prefix "header",
-        plus summary entries for attachments and date.
-
-        Returns
-        -------
-        list[MetadataEntry]
-            Sorted list of metadata entries, or ``[]`` on parse failure.
-        """
-        result: list[MetadataEntry] = []
+    def extract_metadata(self, document_path: Path, mime_type: str):
+        result = []

        try:
            mail = self.parse_file_to_message(document_path)
        except ParseError as e:
-            logger.warning(
-                "Error while fetching document metadata for %s: %s",
-                document_path,
-                e,
+            self.log.warning(
+                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return result

@@ -394,7 +86,7 @@ class MailDocumentParser:
            try:
                value.encode("utf-8")
            except UnicodeEncodeError as e:  # pragma: no cover
-                logger.debug("Skipping header %s: %s", key, e)
+                self.log.debug(f"Skipping header {key}: {e}")
                continue

            result.append(
@@ -431,44 +123,81 @@ class MailDocumentParser:
        result.sort(key=lambda item: (item["prefix"], item["key"]))
        return result

-    # ------------------------------------------------------------------
-    # Email-specific methods
-    # ------------------------------------------------------------------
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        file_name=None,
+        mailrule_id: int | None = None,
+    ) -> None:
+        """
+        Parses the given .eml into formatted text, based on the decoded email.

-    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
-        """Convert the OCR output type setting to a Gotenberg PdfAFormat."""
-        if settings.OCR_OUTPUT_TYPE in {
-            OutputTypeChoices.PDF_A,
-            OutputTypeChoices.PDF_A2,
-        }:
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
-            logger.warning(
-                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
-            )
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
-            return PdfAFormat.A3b
-        return None
+        """
+
+        def strip_text(text: str):
+            """
+            Reduces the spacing of the given text string
+            """
+            text = re.sub(r"\s+", " ", text)
+            text = re.sub(r"(\n *)+", "\n", text)
+            return text.strip()
+
+        def build_formatted_text(mail_message: MailMessage) -> str:
+            """
+            Constructs a formatted string, based on the given email.  Basically tries
+            to get most of the email content, including front matter, into a nice string
+            """
+            fmt_text = f"Subject: {mail_message.subject}\n\n"
+            fmt_text += f"From: {mail_message.from_values.full}\n\n"
+            to_list = [address.full for address in mail_message.to_values]
+            fmt_text += f"To: {', '.join(to_list)}\n\n"
+            if mail_message.cc_values:
+                fmt_text += (
+                    f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
+                )
+            if mail_message.bcc_values:
+                fmt_text += (
+                    f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
+                )
+            if mail_message.attachments:
+                att = []
+                for a in mail.attachments:
+                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
+                    att.append(
+                        f"{a.filename} ({attachment_size})",
+                    )
+                fmt_text += f"Attachments: {', '.join(att)}\n\n"
+
+            if mail.html:
+                fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
+
+            fmt_text += f"\n\n{strip_text(mail.text)}"
+
+            return fmt_text
+
+        self.log.debug(f"Parsing file {document_path.name} into an email")
+        mail = self.parse_file_to_message(document_path)
+
+        self.log.debug("Building formatted text from email")
+        self.text = build_formatted_text(mail)
+
+        if is_naive(mail.date):
+            self.date = make_aware(mail.date)
+        else:
+            self.date = mail.date
+
+        self.log.debug("Creating a PDF from the email")
+        if mailrule_id:
+            rule = MailRule.objects.get(pk=mailrule_id)
+            self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
+        else:
+            self.archive_path = self.generate_pdf(mail)

    @staticmethod
    def parse_file_to_message(filepath: Path) -> MailMessage:
-        """Parse the given .eml file into a MailMessage object.
-
-        Parameters
-        ----------
-        filepath:
-            Path to the .eml file.
-
-        Returns
-        -------
-        MailMessage
-            Parsed mail message.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If the file cannot be parsed or is missing required fields.
+        """
+        Parses the given .eml file into a MailMessage object
        """
        try:
            with filepath.open("rb") as eml:
@@ -484,25 +213,8 @@ class MailDocumentParser:

        return parsed

-    def tika_parse(self, html: str) -> str:
-        """Send HTML content to the Tika server for text extraction.
-
-        Parameters
-        ----------
-        html:
-            HTML string to parse.
-
-        Returns
-        -------
-        str
-            Extracted plain text.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If the Tika server cannot be reached or returns an error.
-        """
-        logger.info("Sending content to Tika server")
+    def tika_parse(self, html: str):
+        self.log.info("Sending content to Tika server")

        try:
            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
@@ -522,32 +234,16 @@ class MailDocumentParser:
        mail_message: MailMessage,
        pdf_layout: MailRule.PdfLayout | None = None,
    ) -> Path:
-        """Generate a PDF from the email message.
-
-        Creates separate PDFs for the email body and HTML content, then
-        merges them according to the requested layout.
-
-        Parameters
-        ----------
-        mail_message:
-            Parsed email message.
-        pdf_layout:
-            Layout option for the PDF. Falls back to the
-            EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
-
-        Returns
-        -------
-        Path
-            Path to the generated PDF inside the temporary directory.
-        """
-        archive_path = Path(self._tempdir) / "merged.pdf"
+        archive_path = Path(self.tempdir) / "merged.pdf"

        mail_pdf_file = self.generate_pdf_from_mail(mail_message)

-        pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
+        pdf_layout = (
+            pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
+        )  # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout

-        # If no HTML content, create the PDF from the message.
-        # Otherwise, create 2 PDFs and merge them with Gotenberg.
+        # If no HTML content, create the PDF from the message
+        # Otherwise, create 2 PDFs and merge them with Gotenberg
        if not mail_message.html:
            archive_path.write_bytes(mail_pdf_file.read_bytes())
        else:
@@ -556,7 +252,7 @@ class MailDocumentParser:
                mail_message.attachments,
            )

-            logger.debug("Merging email text and HTML content into single PDF")
+            self.log.debug("Merging email text and HTML content into single PDF")

            with (
                GotenbergClient(
@@ -591,21 +287,15 @@ class MailDocumentParser:
        return archive_path

    def mail_to_html(self, mail: MailMessage) -> Path:
-        """Convert the given email into an HTML file using a template.
-
-        Parameters
-        ----------
-        mail:
-            Parsed mail message.
-
-        Returns
-        -------
-        Path
-            Path to the rendered HTML file inside the temporary directory.
+        """
+        Converts the given email into an HTML file, formatted
+        based on the given template
        """

        def clean_html(text: str) -> str:
-            """Attempt to clean, escape, and linkify the given HTML string."""
+            """
+            Attempts to clean, escape and linkify the given HTML string
+            """
            if isinstance(text, list):
                text = "\n".join([str(e) for e in text])
            if not isinstance(text, str):
@@ -650,37 +340,19 @@ class MailDocumentParser:

        from django.template.loader import render_to_string

-        html_file = Path(self._tempdir) / "email_as_html.html"
+        html_file = Path(self.tempdir) / "email_as_html.html"
        html_file.write_text(render_to_string("email_msg_template.html", context=data))

        return html_file

    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
-        """Create a PDF from the email body using an HTML template and Gotenberg.
-
-        Parameters
-        ----------
-        mail:
-            Parsed mail message.
-
-        Returns
-        -------
-        Path
-            Path to the generated PDF inside the temporary directory.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If Gotenberg returns an error.
        """
-        logger.info("Converting mail to PDF")
+        Creates a PDF based on the given email, using the email's values in
+        an HTML template
+        """
+        self.log.info("Converting mail to PDF")

-        css_file = (
-            Path(__file__).parent.parent.parent
-            / "paperless_mail"
-            / "templates"
-            / "output.css"
-        )
+        css_file = Path(__file__).parent / "templates" / "output.css"
        email_html_file = self.mail_to_html(mail)

        with (
@@ -716,7 +388,7 @@ class MailDocumentParser:
                    f"Error while converting email to PDF: {err}",
                ) from err

-        email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
+        email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
        email_as_pdf_file.write_bytes(response.content)

        return email_as_pdf_file
@@ -726,27 +398,11 @@ class MailDocumentParser:
        orig_html: str,
        attachments: list[MailAttachment],
    ) -> Path:
-        """Generate a PDF from the HTML content of the email.
-
-        Parameters
-        ----------
-        orig_html:
-            Raw HTML string from the email body.
-        attachments:
-            List of email attachments (used as inline resources).
-
-        Returns
-        -------
-        Path
-            Path to the generated PDF inside the temporary directory.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If Gotenberg returns an error.
+        """
+        Generates a PDF file based on the HTML and attachments of the email
        """

-        def clean_html_script(text: str) -> str:
+        def clean_html_script(text: str):
            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
            text = compiled_open.sub("<div hidden ", text)

@@ -754,9 +410,9 @@ class MailDocumentParser:
            text = compiled_close.sub("</div", text)
            return text

-        logger.info("Converting message html to PDF")
+        self.log.info("Converting message html to PDF")

-        tempdir = Path(self._tempdir)
+        tempdir = Path(self.tempdir)

        html_clean = clean_html_script(orig_html)
        html_clean_file = tempdir / "index.html"
@@ -817,3 +473,9 @@ class MailDocumentParser:
        html_pdf = tempdir / "html.pdf"
        html_pdf.write_bytes(response.content)
        return html_pdf
+
+    def get_settings(self) -> None:
+        """
+        This parser does not implement additional settings yet
+        """
+        return None
--- a/src/paperless_mail/signals.py
+++ b/src/paperless_mail/signals.py
@@ -1,12 +1,7 @@
 def get_parser(*args, **kwargs):
-    from paperless.parsers.mail import MailDocumentParser
+    from paperless_mail.parsers import MailDocumentParser

-    # MailDocumentParser accepts no constructor args in the new-style protocol.
-    # Pop legacy args that arrive from the signal-based consumer path.
-    # Phase 4 will replace this signal path with the ParserRegistry.
-    kwargs.pop("logging_group", None)
-    kwargs.pop("progress_callback", None)
-    return MailDocumentParser()
+    return MailDocumentParser(*args, **kwargs)


 def mail_consumer_declaration(sender, **kwargs):
--- a/src/paperless_mail/tests/conftest.py
+++ b/src/paperless_mail/tests/conftest.py
@@ -1,9 +1,71 @@
 from collections.abc import Generator
+from pathlib import Path

 import pytest

 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.models import MailAccount
+from paperless_mail.parsers import MailDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def broken_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "broken.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml"
+
+
+@pytest.fixture(scope="session")
+def html_email_pdf_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def html_email_thumbnail_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_html_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.html"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_first(sample_dir: Path) -> Path:
+    return sample_dir / "first.pdf"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_second(sample_dir: Path) -> Path:
+    return sample_dir / "second.pdf"
+
+
+@pytest.fixture()
+def mail_parser() -> MailDocumentParser:
+    return MailDocumentParser(logging_group=None)


@pytest.fixture()
@@ -27,3 +89,11 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
@pytest.fixture()
 def mail_account_handler() -> MailAccountHandler:
    return MailAccountHandler()
+
+
+@pytest.fixture(scope="session")
+def nginx_base_url() -> Generator[str, None, None]:
+    """
+    The base URL for the nginx HTTP server we expect to be alive
+    """
+    yield "http://localhost:8080"
--- a/src/paperless_mail/tests/samples/broken.eml
+++ b/src/paperless_mail/tests/samples/broken.eml
--- a/src/paperless_mail/tests/samples/first.pdf
+++ b/src/paperless_mail/tests/samples/first.pdf
--- a/src/paperless_mail/tests/samples/html.eml
+++ b/src/paperless_mail/tests/samples/html.eml
--- a/src/paperless_mail/tests/samples/html.eml.html
+++ b/src/paperless_mail/tests/samples/html.eml.html
--- a/src/paperless_mail/tests/samples/html.eml.pdf
+++ b/src/paperless_mail/tests/samples/html.eml.pdf
--- a/src/paperless_mail/tests/samples/html.eml.pdf.webp
+++ b/src/paperless_mail/tests/samples/html.eml.pdf.webp
--- a/src/paperless_mail/tests/samples/sample.html
+++ b/src/paperless_mail/tests/samples/sample.html
--- a/src/paperless_mail/tests/samples/sample.html.pdf
+++ b/src/paperless_mail/tests/samples/sample.html.pdf
--- a/src/paperless_mail/tests/samples/sample.html.pdf.webp
+++ b/src/paperless_mail/tests/samples/sample.html.pdf.webp
--- a/src/paperless_mail/tests/samples/sample.png
+++ b/src/paperless_mail/tests/samples/sample.png
--- a/src/paperless_mail/tests/samples/second.pdf
+++ b/src/paperless_mail/tests/samples/second.pdf
--- a/src/paperless_mail/tests/samples/simple_text.eml
+++ b/src/paperless_mail/tests/samples/simple_text.eml
--- a/src/paperless_mail/tests/samples/simple_text.eml.pdf
+++ b/src/paperless_mail/tests/samples/simple_text.eml.pdf
--- a/src/paperless_mail/tests/samples/simple_text.eml.pdf.webp
+++ b/src/paperless_mail/tests/samples/simple_text.eml.pdf.webp
--- a/src/paperless/tests/parsers/test_mail_parser.py
+++ b/src/paperless/tests/parsers/test_mail_parser.py
@@ -12,7 +12,7 @@ from pytest_httpx import HTTPXMock
 from pytest_mock import MockerFixture

 from documents.parsers import ParseError
-from paperless.parsers.mail import MailDocumentParser
+from paperless_mail.parsers import MailDocumentParser


 class TestEmailFileParsing:
@@ -24,7 +24,7 @@ class TestEmailFileParsing:
    def test_parse_error_missing_file(
        self,
        mail_parser: MailDocumentParser,
-        mail_samples_dir: Path,
+        sample_dir: Path,
    ) -> None:
        """
        GIVEN:
@@ -35,7 +35,7 @@ class TestEmailFileParsing:
            - An Exception is thrown
        """
        # Check if exception is raised when parsing fails.
-        test_file = mail_samples_dir / "doesntexist.eml"
+        test_file = sample_dir / "doesntexist.eml"

        assert not test_file.exists()

@@ -246,12 +246,12 @@ class TestEmailThumbnailGenerate:
        """
        mocked_return = "Passing the return value through.."
        mock_make_thumbnail_from_pdf = mocker.patch(
-            "paperless.parsers.mail.make_thumbnail_from_pdf",
+            "paperless_mail.parsers.make_thumbnail_from_pdf",
        )
        mock_make_thumbnail_from_pdf.return_value = mocked_return

        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = "Mocked return value.."

@@ -260,7 +260,8 @@ class TestEmailThumbnailGenerate:
        mock_generate_pdf.assert_called_once()
        mock_make_thumbnail_from_pdf.assert_called_once_with(
            "Mocked return value..",
-            mail_parser._tempdir,
+            mail_parser.tempdir,
+            None,
        )

        assert mocked_return == thumb
@@ -372,7 +373,7 @@ class TestParser:
        """
        # Validate parsing returns the expected results
        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )

        mail_parser.parse(simple_txt_email_file, "message/rfc822")
@@ -384,7 +385,7 @@ class TestParser:
            "BCC: fdf@fvf.de\n\n"
            "\n\nThis is just a simple Text Mail."
        )
-        assert text_expected == mail_parser.get_text()
+        assert text_expected == mail_parser.text
        assert (
            datetime.datetime(
                2022,
@@ -395,7 +396,7 @@ class TestParser:
                43,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.get_date()
+            == mail_parser.date
        )

        # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
@@ -418,7 +419,7 @@ class TestParser:
        """

        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )

        # Validate parsing returns the expected results
@@ -442,7 +443,7 @@ class TestParser:
        mail_parser.parse(html_email_file, "message/rfc822")

        mock_generate_pdf.assert_called_once()
-        assert text_expected == mail_parser.get_text()
+        assert text_expected == mail_parser.text
        assert (
            datetime.datetime(
                2022,
@@ -453,7 +454,7 @@ class TestParser:
                19,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.get_date()
+            == mail_parser.date
        )

    def test_generate_pdf_parse_error(
@@ -500,7 +501,7 @@ class TestParser:

        mail_parser.parse(simple_txt_email_file, "message/rfc822")

-        assert mail_parser.get_archive_path() is not None
+        assert mail_parser.archive_path is not None

    @pytest.mark.httpx_mock(can_send_already_matched_responses=True)
    def test_generate_pdf_html_email(
@@ -541,7 +542,7 @@ class TestParser:
        )
        mail_parser.parse(html_email_file, "message/rfc822")

-        assert mail_parser.get_archive_path() is not None
+        assert mail_parser.archive_path is not None

    def test_generate_pdf_html_email_html_to_pdf_failure(
        self,
@@ -711,10 +712,10 @@ class TestParser:

        def test_layout_option(layout_option, expected_calls, expected_pdf_names):
            mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
-            mail_parser.mailrule_id = 1
            mail_parser.parse(
                document_path=html_email_file,
                mime_type="message/rfc822",
+                mailrule_id=1,
            )
            args, _ = mock_merge_route.call_args
            assert len(args[0]) == expected_calls
--- a/src/paperless/tests/parsers/test_mail_parser_live.py
+++ b/src/paperless/tests/parsers/test_mail_parser_live.py
@@ -11,7 +11,7 @@ from PIL import Image
 from pytest_mock import MockerFixture

 from documents.tests.utils import util_call_with_backoff
-from paperless.parsers.mail import MailDocumentParser
+from paperless_mail.parsers import MailDocumentParser


 def extract_text(pdf_path: Path) -> str:
@@ -159,7 +159,7 @@ class TestParserLive:
            - The returned thumbnail image file shall match the expected hash
        """
        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = simple_txt_email_pdf_file

@@ -216,10 +216,10 @@ class TestParserLive:
            - The merged PDF shall contain text from both source PDFs
        """
        mock_generate_pdf_from_html = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
        )
        mock_generate_pdf_from_mail = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
        )
        mock_generate_pdf_from_mail.return_value = merged_pdf_first
        mock_generate_pdf_from_html.return_value = merged_pdf_second
--- a/uv.lock
+++ b/uv.lock
@@ -5643,7 +5643,7 @@ wheels = [

 [[package]]
 name = "zensical"
-version = "0.0.26"
+version = "0.0.25"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -5653,18 +5653,16 @@ dependencies = [
    { name = "pymdown-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d5/1f/0a0b1ce8e0553a9dabaedc736d0f34b11fc33d71ff46bce44d674996d41f/zensical-0.0.26.tar.gz", hash = "sha256:f4d9c8403df25fbb3d6dd9577122dc2f23c73a2d16ab778bb7d40370dd71e987", size = 3841473, upload-time = "2026-03-11T09:51:38.838Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/18/69/4b49ce778059b4888ea854cf4db40e1b2080fe828b7280198999048d6fce/zensical-0.0.25.tar.gz", hash = "sha256:462808359d949469fa7209d367f2e38ed796744074e5dadeac9ddfef0c44be25", size = 3841318, upload-time = "2026-03-10T19:32:35.048Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/41/58/fa3d9538ff1ea8cf4a193edbf47254f374fa7983fcfa876bb4336d72c53a/zensical-0.0.26-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7823b25afe7d36099253aa59d643abaac940f80fd015d4a37954210c87d3da56", size = 12263607, upload-time = "2026-03-11T09:50:49.202Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/6e/44a3b21bd3569b9cad203364d73a956768d28a879e4c2be91bd889f74d2c/zensical-0.0.26-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:c0254814382cdd3769bc7689180d09bf41de8879871dd736dc52d5f141e8ada7", size = 12144562, upload-time = "2026-03-11T09:50:53.685Z" },
-    { url = "https://files.pythonhosted.org/packages/07/ae/31b9885745b3e7ef23a3ae7f175b879807288d11b3fb7e2d3c119c916258/zensical-0.0.26-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c8e601b2bbd239e564b04cf235eefb9777e7dfc7e1857b8871d6cdcfb577aa0", size = 12506728, upload-time = "2026-03-11T09:50:57.775Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/93/f5291e2c47076474f181f6eef35ef0428117d3f192da4358c0511e2ce09e/zensical-0.0.26-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2dc43c7e6c25d9724fc0450f0273ca4e5e2506eeb7f89f52f1405a592896ca3b", size = 12454975, upload-time = "2026-03-11T09:51:01.514Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/2e/61cac4f2ebad31dab768eb02753ffde9e56d4d34b8f876b949bf516fbd50/zensical-0.0.26-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24ed236d1254cc474c19227eaa3670a1ccf921af53134ec5542b05853bdcd59c", size = 12791930, upload-time = "2026-03-11T09:51:05.162Z" },
-    { url = "https://files.pythonhosted.org/packages/02/86/51995d1ed2dd6ad8a1a70bcdf3c5eb16b50e62ea70e638d454a6b9061c4d/zensical-0.0.26-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1110147710d1dd025d932c4a7eada836bdf079c91b70fb0ae5b202e14b094617", size = 12548166, upload-time = "2026-03-11T09:51:09.218Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/93/decbafdbfc77170cbc3851464632390846e9aaf45e743c8dd5a24d5673e9/zensical-0.0.26-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7d21596a785428cdebc20859bd94a05334abe14ad24f1bb9cd80d19219e3c220", size = 12682103, upload-time = "2026-03-11T09:51:12.68Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/e2/391d2d08dde621177da069a796a886b549fefb15734aeeb6e696af99b662/zensical-0.0.26-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:680a3c7bb71499b4da784d6072e44b3d7b8c0df3ce9bbd9974e24bd8058c2736", size = 12724219, upload-time = "2026-03-11T09:51:17.32Z" },
-    { url = "https://files.pythonhosted.org/packages/80/2a/21b40c5c40a67da8a841f278d61dbd8d5e035e489de6fe1cef5f4e211b4f/zensical-0.0.26-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:e3294a79f98218b6fc2219232e166aa0932ae4dad58f6c8dbc0dbe0ecbff9c25", size = 12862117, upload-time = "2026-03-11T09:51:22.161Z" },
-    { url = "https://files.pythonhosted.org/packages/51/76/e1910d6d75d207654c867b8efbda6822dedda9fed3601bf4a864a1f4fe26/zensical-0.0.26-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:630229587df1fb47be184a4a69d0772ce59a44cd2c481ae9f7e8852fffaff11e", size = 12815714, upload-time = "2026-03-11T09:51:26.24Z" },
+    { url = "https://files.pythonhosted.org/packages/42/7c/f6f5eb1903b5a557d98f48d09e3d4bc33033ed78508986250dabe5387d91/zensical-0.0.25-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c481dd16a968f97d43f6b596e10e941d8294ed446b8b117235a6b149c0d6965", size = 12263809, upload-time = "2026-03-10T19:31:49.907Z" },
+    { url = "https://files.pythonhosted.org/packages/37/b2/3f8be43526a68c52c84f099887d1903c2526a22aa4344378a72671bf6070/zensical-0.0.25-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ae51751e8b11f50df04641b40c1e07d4b703fed9d9548b16dbcb0cf260da229a", size = 12146107, upload-time = "2026-03-10T19:31:53.576Z" },
+    { url = "https://files.pythonhosted.org/packages/16/59/89a3a715b1fe538b4b5ee382d71b86bd06d4f351383e36eefd36e824c150/zensical-0.0.25-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56ccf88245bd0b3684bf313384164972f1890802d4a51dd9b7ae6ea126a810bc", size = 12505963, upload-time = "2026-03-10T19:31:57.517Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/5b/cc0bada291818bdf36be777af9c16f655a021f16578a31e6fb233affca03/zensical-0.0.25-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2f4e58bcc06f3e50cc518666a0c9d8f82246255a42b37bb1d7c7343e214fbac", size = 12455496, upload-time = "2026-03-10T19:32:02.37Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/16/ff91ee42d8b14a1b63e2e0d74922e6c4b0ec1da3819377f20b7ca2742f76/zensical-0.0.25-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:69895273b1319a45667abac543c3e5065ff2a646d9a698eae056b6a35b57e00a", size = 12683609, upload-time = "2026-03-10T19:32:06.144Z" },
+    { url = "https://files.pythonhosted.org/packages/01/fd/a85acc4234d31658f4bb54c4900edfc8d4227ad83e4c79de92cfdcd05c79/zensical-0.0.25-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c51a00ae1de2e9647bfd0ea1965b223fb3891111a00930416e1277e06f3ab3c4", size = 12725420, upload-time = "2026-03-10T19:32:09.938Z" },
+    { url = "https://files.pythonhosted.org/packages/37/c7/896c91e457af3d5769d8d70d2bd66a8a287ad129879b51ab5e985ac68889/zensical-0.0.25-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:28e56ec1f06ea66227c1f5af9d7a6ed3bd4246e6af1e45d29e09f40251b52e1f", size = 12861970, upload-time = "2026-03-10T19:32:13.471Z" },
+    { url = "https://files.pythonhosted.org/packages/41/06/5d804cf19e4e093394674d9f213546dc1364a34fd85d81a1153b05733c5a/zensical-0.0.25-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2d5997baad148b65eb0de6baf81973110538e01a3f64467d06d0c5ac23b0d70", size = 12816321, upload-time = "2026-03-10T19:32:17.031Z" },
 ]

 [[package]]
Author	SHA1	Message	Date
Trenton H	6a4da4c46e	Adds a docstring that an IDE will render better	2026-03-18 15:26:10 -07:00
Trenton H	741486df16	Handles the rename of the migration	2026-03-18 15:23:37 -07:00
Trenton H	cdbb118f1c	Fixes logging so I can see it	2026-03-18 15:22:18 -07:00
Trenton H	cea5971ad8	Batch based iteration and bulk updates, with chunked file reading	2026-03-18 15:22:18 -07:00
Trenton H	156ee4e2ee	Transitions to SHA256 based checksums	2026-03-18 15:22:17 -07:00