Adds a docstring that an IDE will render better

Handles the rename of the migration
Fixes logging so I can see it
2026-03-19 23:45:57 +00:00 · 2026-03-18 15:26:10 -07:00 · 2026-03-18 15:23:37 -07:00 · 2026-03-18 15:22:18 -07:00 · 2026-03-18 15:22:18 -07:00 · 2026-03-18 15:22:17 -07:00
85 changed files with 2126 additions and 4362 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -248,13 +248,15 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
 lint.per-file-ignores."src/documents/models.py" = [
  "SIM115",
 ]
-
+lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
+  "RUF001",
+]
 lint.isort.force-single-line = true

 [tool.codespell]
 write-changes = true
 ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
-skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
+skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"

 [tool.pytest]
 minversion = "9.0"
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,4 @@
 import datetime
-import hashlib
 import os
 import tempfile
 from enum import StrEnum
@@ -48,15 +47,13 @@ from documents.signals import document_consumption_started
 from documents.signals import document_updated
 from documents.signals.handlers import run_workflows
 from documents.templating.workflows import parse_w_workflow_placeholders
+from documents.utils import compute_checksum
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
-from paperless.parsers import ParserContext
-from paperless.parsers.mail import MailDocumentParser
-from paperless.parsers.remote import RemoteDocumentParser
-from paperless.parsers.tesseract import RasterisedDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
+from paperless_mail.parsers import MailDocumentParser

 LOGGING_NAME: Final[str] = "paperless.consumer"

@@ -71,16 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:

    TODO(stumpylog): Remove me in the future
    """
-    if isinstance(
-        parser,
-        (
-            MailDocumentParser,
-            RasterisedDocumentParser,
-            RemoteDocumentParser,
-            TextDocumentParser,
-            TikaDocumentParser,
-        ),
-    ):
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
        parser.__exit__(None, None, None)
    else:
        parser.cleanup()
@@ -226,9 +214,7 @@ class ConsumerPlugin(
        version_doc = Document(
            root_document=root_doc_frozen,
            version_index=next_version_index + 1,
-            checksum=hashlib.md5(
-                file_for_checksum.read_bytes(),
-            ).hexdigest(),
+            checksum=compute_checksum(file_for_checksum),
            content=text or "",
            page_count=page_count,
            mime_type=mime_type,
@@ -461,21 +447,10 @@ class ConsumerPlugin(
            progress_callback=progress_callback,
        )

-        parser_is_new_style = isinstance(
-            document_parser,
-            (
-                MailDocumentParser,
-                RasterisedDocumentParser,
-                RemoteDocumentParser,
-                TextDocumentParser,
-                TikaDocumentParser,
-            ),
-        )
-
        # New-style parsers use __enter__/__exit__ for resource management.
        # _parser_cleanup (below) handles __exit__; call __enter__ here.
        # TODO(stumpylog): Remove me in the future
-        if parser_is_new_style:
+        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
            document_parser.__enter__()

        self.log.debug(f"Parser: {type(document_parser).__name__}")
@@ -496,12 +471,17 @@ class ConsumerPlugin(
                ConsumerStatusShortMessage.PARSING_DOCUMENT,
            )
            self.log.debug(f"Parsing {self.filename}...")
-
-            # TODO(stumpylog): Remove me in the future when all parsers use new protocol
-            if parser_is_new_style:
-                document_parser.configure(
-                    ParserContext(mailrule_id=self.input_doc.mailrule_id),
+            if (
+                isinstance(document_parser, MailDocumentParser)
+                and self.input_doc.mailrule_id
+            ):
+                document_parser.parse(
+                    self.working_copy,
+                    mime_type,
+                    self.filename,
+                    self.input_doc.mailrule_id,
                )
+            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                # TODO(stumpylog): Remove me in the future
                document_parser.parse(self.working_copy, mime_type)
            else:
@@ -514,8 +494,8 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            # TODO(stumpylog): Remove me in the future when all parsers use new protocol
-            if parser_is_new_style:
+            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+                # TODO(stumpylog): Remove me in the future
                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
            else:
                thumbnail = document_parser.get_thumbnail(
@@ -705,10 +685,9 @@ class ConsumerPlugin(
                            document.archive_path,
                        )

-                        with Path(archive_path).open("rb") as f:
-                            document.archive_checksum = hashlib.md5(
-                                f.read(),
-                            ).hexdigest()
+                        document.archive_checksum = compute_checksum(
+                            Path(archive_path),
+                        )

                # Don't save with the lock active. Saving will cause the file
                # renaming logic to acquire the lock as well.
@@ -849,7 +828,7 @@ class ConsumerPlugin(
            title=title[:127],
            content=text,
            mime_type=mime_type,
-            checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
+            checksum=compute_checksum(file_for_checksum),
            created=create_date,
            modified=create_date,
            page_count=page_count,
@@ -966,10 +945,9 @@ class ConsumerPreflightPlugin(

    def pre_check_duplicate(self) -> None:
        """
-        Using the MD5 of the file, check this exact file doesn't already exist
+        Using the SHA256 of the file, check this exact file doesn't already exist
        """
-        with Path(self.input_doc.original_file).open("rb") as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
+        checksum = compute_checksum(Path(self.input_doc.original_file))
        existing_doc = Document.global_objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger
 from documents.settings import EXPORTER_ARCHIVE_NAME
 from documents.settings import EXPORTER_FILE_NAME
 from documents.settings import EXPORTER_THUMBNAIL_NAME
+from documents.utils import compute_checksum
 from documents.utils import copy_file_with_basic_stats
 from paperless import version
 from paperless.models import ApplicationConfiguration
@@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand):
            source_stat = source.stat()
            target_stat = target.stat()
            if self.compare_checksums and source_checksum:
-                target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
+                target_checksum = compute_checksum(target)
                perform_copy = target_checksum != source_checksum
            elif (
                source_stat.st_mtime != target_stat.st_mtime
--- a/src/documents/management/commands/document_thumbnails.py
+++ b/src/documents/management/commands/document_thumbnails.py
@@ -4,11 +4,6 @@ import shutil
 from documents.management.commands.base import PaperlessCommand
 from documents.models import Document
 from documents.parsers import get_parser_class_for_mime_type
-from paperless.parsers.mail import MailDocumentParser
-from paperless.parsers.remote import RemoteDocumentParser
-from paperless.parsers.tesseract import RasterisedDocumentParser
-from paperless.parsers.text import TextDocumentParser
-from paperless.parsers.tika import TikaDocumentParser

 logger = logging.getLogger("paperless.management.thumbnails")

@@ -27,38 +22,16 @@ def _process_document(doc_id: int) -> None:

    parser = parser_class(logging_group=None)

-    parser_is_new_style = isinstance(
-        parser,
-        (
-            MailDocumentParser,
-            RasterisedDocumentParser,
-            RemoteDocumentParser,
-            TextDocumentParser,
-            TikaDocumentParser,
-        ),
-    )
-
-    # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
-    if parser_is_new_style:
-        parser.__enter__()
-
    try:
-        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
-        if parser_is_new_style:
-            thumb = parser.get_thumbnail(document.source_path, document.mime_type)
-        else:
-            thumb = parser.get_thumbnail(
-                document.source_path,
-                document.mime_type,
-                document.get_public_filename(),
-            )
+        thumb = parser.get_thumbnail(
+            document.source_path,
+            document.mime_type,
+            document.get_public_filename(),
+        )
        shutil.move(thumb, document.thumbnail_path)
    finally:
        # TODO(stumpylog): Cleanup once all parsers are handled
-        if parser_is_new_style:
-            parser.__exit__(None, None, None)
-        else:
-            parser.cleanup()
+        parser.cleanup()


 class Command(PaperlessCommand):
--- a/src/documents/migrations/0016_sha256_checksums.py
+++ b/src/documents/migrations/0016_sha256_checksums.py
@@ -0,0 +1,130 @@
+import hashlib
+import logging
+from pathlib import Path
+
+from django.conf import settings
+from django.db import migrations
+from django.db import models
+
+logger = logging.getLogger("paperless.migrations")
+
+_CHUNK_SIZE = 65536  # 64 KiB — avoids loading entire files into memory
+_BATCH_SIZE = 500  # documents per bulk_update call
+_PROGRESS_INTERVAL = 500  # log a progress line every N documents
+
+
+def _sha256(path: Path) -> str:  # streaming SHA-256 hex digest of one file
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        for block in iter(lambda: stream.read(_CHUNK_SIZE), b""):  # b"" == EOF
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def recompute_checksums(apps, schema_editor):
+    """Recompute stored document checksums as SHA-256, replacing MD5 values.
+
+    Forward callable for ``migrations.RunPython``; ``schema_editor`` is unused.
+    A file that does not exist on disk is logged as a warning and its stored
+    checksum is left as it was.  Rows that changed are saved with
+    ``bulk_update`` once ``_BATCH_SIZE`` of them have accumulated.
+    """
+    document_model = apps.get_model("documents", "Document")
+
+    total = document_model.objects.count()
+    if not total:
+        return
+
+    logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
+
+    wanted = ("pk", "filename", "checksum", "archive_filename", "archive_checksum")
+    rows = document_model.objects.only(*wanted).iterator(chunk_size=_BATCH_SIZE)
+    written = ["checksum", "archive_checksum"]
+    pending: list = []
+
+    for processed, doc in enumerate(rows, start=1):
+        changed = False
+
+        # Rebuild the original's path; intended to match Document.source_path.
+        relative = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
+        original = (settings.ORIGINALS_DIR / Path(relative)).resolve()
+
+        if not original.exists():
+            logger.warning(
+                "Document %s: original file %s not found, checksum not updated.",
+                doc.pk,
+                original,
+            )
+        else:
+            doc.checksum = _sha256(original)
+            changed = True
+
+        # An archive version is only looked for when archive_filename is set.
+        if doc.archive_filename is not None:
+            archive_name = Path(str(doc.archive_filename))
+            archived = (settings.ARCHIVE_DIR / archive_name).resolve()
+            if not archived.exists():
+                logger.warning(
+                    "Document %s: archive file %s not found, checksum not updated.",
+                    doc.pk,
+                    archived,
+                )
+            else:
+                doc.archive_checksum = _sha256(archived)
+                changed = True
+
+        if changed:
+            pending.append(doc)
+
+        if len(pending) >= _BATCH_SIZE:
+            document_model.objects.bulk_update(pending, written)
+            pending.clear()
+
+        if processed % _PROGRESS_INTERVAL == 0:
+            logger.info(
+                "SHA-256 checksum progress: %d/%d (%d%%)",
+                processed,
+                total,
+                processed * 100 // total,
+            )
+
+    if pending:
+        document_model.objects.bulk_update(pending, written)
+
+    # The summary reports the initial count, not the rows actually rewritten.
+    logger.info(
+        "SHA-256 checksum recomputation complete: %d document(s) processed.",
+        total,
+    )
+
+
+class Migration(migrations.Migration):  # widen checksum columns, then re-hash files
+    dependencies = [
+        ("documents", "0015_document_version_index_and_more"),
+    ]
+
+    operations = [  # columns are widened before the data step rewrites them
+        migrations.AlterField(
+            model_name="document",
+            name="checksum",
+            field=models.CharField(
+                editable=False,
+                help_text="The checksum of the original document.",
+                max_length=64,  # length of a SHA-256 hex digest
+                verbose_name="checksum",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="document",
+            name="archive_checksum",
+            field=models.CharField(
+                blank=True,
+                editable=False,
+                help_text="The checksum of the archived document.",
+                max_length=64,  # length of a SHA-256 hex digest
+                null=True,
+                verbose_name="archive checksum",
+            ),
+        ),
+        migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
+    ]  # the data step's reverse is RunPython.noop
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner):  # type: ignore[django-manager-

    checksum = models.CharField(
        _("checksum"),
-        max_length=32,
+        max_length=64,
        editable=False,
        help_text=_("The checksum of the original document."),
    )

    archive_checksum = models.CharField(
        _("archive checksum"),
-        max_length=32,
+        max_length=64,
        editable=False,
        blank=True,
        null=True,
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@@ -11,7 +11,6 @@ is an identity function that adds no overhead.

 from __future__ import annotations

-import hashlib
 import logging
 import uuid
 from collections import defaultdict
@@ -30,6 +29,7 @@ from django.utils import timezone

 from documents.models import Document
 from documents.models import PaperlessTask
+from documents.utils import compute_checksum
 from paperless.config import GeneralConfig

 logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(

    present_files.discard(source_path)
    try:
-        checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
+        checksum = compute_checksum(source_path)
    except OSError as e:
        messages.error(doc.pk, f"Cannot read original file of document: {e}")
    else:
@@ -255,7 +255,7 @@ def _check_archive(

        present_files.discard(archive_path)
        try:
-            checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
+            checksum = compute_checksum(archive_path)
        except OSError as e:
            messages.error(
                doc.pk,
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -1,5 +1,4 @@
 import datetime
-import hashlib
 import logging
 import shutil
 import uuid
@@ -63,14 +62,9 @@ from documents.signals import document_updated
 from documents.signals.handlers import cleanup_document_deletion
 from documents.signals.handlers import run_workflows
 from documents.signals.handlers import send_websocket_document_updated
+from documents.utils import compute_checksum
 from documents.workflows.utils import get_workflows_for_trigger
 from paperless.config import AIConfig
-from paperless.parsers import ParserContext
-from paperless.parsers.mail import MailDocumentParser
-from paperless.parsers.remote import RemoteDocumentParser
-from paperless.parsers.tesseract import RasterisedDocumentParser
-from paperless.parsers.text import TextDocumentParser
-from paperless.parsers.tika import TikaDocumentParser
 from paperless_ai.indexing import llm_index_add_or_update_document
 from paperless_ai.indexing import llm_index_remove_document
 from paperless_ai.indexing import update_llm_index
@@ -310,9 +304,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:

    mime_type = document.mime_type

-    parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
-        mime_type,
-    )
+    parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type)

    if not parser_class:
        logger.error(
@@ -323,48 +315,19 @@ def update_document_content_maybe_archive_file(document_id) -> None:

    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())

-    parser_is_new_style = isinstance(
-        parser,
-        (
-            MailDocumentParser,
-            RasterisedDocumentParser,
-            RemoteDocumentParser,
-            TextDocumentParser,
-            TikaDocumentParser,
-        ),
-    )
-
-    # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
-    if parser_is_new_style:
-        parser.__enter__()
-
    try:
-        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
-        if parser_is_new_style:
-            parser.configure(ParserContext())
-            parser.parse(document.source_path, mime_type)
-        else:
-            parser.parse(
-                document.source_path,
-                mime_type,
-                document.get_public_filename(),
-            )
+        parser.parse(document.source_path, mime_type, document.get_public_filename())

-        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
-        if parser_is_new_style:
-            thumbnail = parser.get_thumbnail(document.source_path, mime_type)
-        else:
-            thumbnail = parser.get_thumbnail(
-                document.source_path,
-                mime_type,
-                document.get_public_filename(),
-            )
+        thumbnail = parser.get_thumbnail(
+            document.source_path,
+            mime_type,
+            document.get_public_filename(),
+        )

        with transaction.atomic():
            oldDocument = Document.objects.get(pk=document.pk)
            if parser.get_archive_path():
-                with Path(parser.get_archive_path()).open("rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
+                checksum = compute_checksum(Path(parser.get_archive_path()))
                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # We also don't use save() since that triggers the filehandling
@@ -439,20 +402,8 @@ def update_document_content_maybe_archive_file(document_id) -> None:
            f"Error while parsing document {document} (ID: {document_id})",
        )
    finally:
-        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
-        if isinstance(
-            parser,
-            (
-                MailDocumentParser,
-                RasterisedDocumentParser,
-                RemoteDocumentParser,
-                TextDocumentParser,
-                TikaDocumentParser,
-            ),
-        ):
-            parser.__exit__(None, None, None)
-        else:
-            parser.cleanup()
+        # TODO(stumpylog): Cleanup once all parsers are handled
+        parser.cleanup()


@shared_task
--- a/src/documents/tests/conftest.py
+++ b/src/documents/tests/conftest.py
@@ -82,8 +82,8 @@ def sample_doc(

    return DocumentFactory(
        title="test",
-        checksum="42995833e01aea9b3edee44bbfdd7ce1",
-        archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+        checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+        archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
        content="test content",
        pk=1,
        filename="0000001.pdf",
--- a/src/documents/tests/factories.py
+++ b/src/documents/tests/factories.py
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
        model = Document

    title = factory.Faker("sentence", nb_words=4)
-    checksum = factory.Faker("md5")
+    checksum = factory.Faker("sha256")
    content = factory.Faker("paragraph")
    correspondent = None
    document_type = None
--- a/src/documents/tests/test_api_status.py
+++ b/src/documents/tests/test_api_status.py
@@ -101,17 +101,13 @@ class TestSystemStatus(APITestCase):
            - The response contains the correct install type
        """
        self.client.force_login(self.user)
-        with mock.patch.dict(os.environ, {"PNGX_CONTAINERIZED": "1"}, clear=False):
-            response = self.client.get(self.ENDPOINT)
-            self.assertEqual(response.status_code, status.HTTP_200_OK)
-            self.assertEqual(response.data["install_type"], "docker")
-        with mock.patch.dict(
-            os.environ,
-            {"PNGX_CONTAINERIZED": "1", "KUBERNETES_SERVICE_HOST": "http://localhost"},
-            clear=False,
-        ):
-            response = self.client.get(self.ENDPOINT)
-            self.assertEqual(response.data["install_type"], "kubernetes")
+        os.environ["PNGX_CONTAINERIZED"] = "1"
+        response = self.client.get(self.ENDPOINT)
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["install_type"], "docker")
+        os.environ["KUBERNETES_SERVICE_HOST"] = "http://localhost"
+        response = self.client.get(self.ENDPOINT)
+        self.assertEqual(response.data["install_type"], "kubernetes")

    @mock.patch("redis.Redis.execute_command")
    def test_system_status_redis_ping(self, mock_ping) -> None:
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -36,6 +36,7 @@ from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 from documents.tests.utils import GetConsumerMixin
 from paperless_mail.models import MailRule
+from paperless_mail.parsers import MailDocumentParser


 class _BaseTestParser(DocumentParser):
@@ -244,8 +245,14 @@ class TestConsumer(

        self.assertIsFile(document.archive_path)

-        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
-        self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
+        self.assertEqual(
+            document.checksum,
+            "1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+        )
+        self.assertEqual(
+            document.archive_checksum,
+            "706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
+        )

        self.assertIsNotFile(filename)

@@ -1090,7 +1097,7 @@ class TestConsumer(
            self.assertEqual(command[1], "--replace-input")

    @mock.patch("paperless_mail.models.MailRule.objects.get")
-    @mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
+    @mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_mail_parser_receives_mailrule(
        self,
@@ -1106,13 +1113,11 @@ class TestConsumer(
        THEN:
            - The mail parser should receive the mail rule
        """
-        from paperless_mail.signals import get_parser as mail_get_parser
-
        mock_consumer_declaration_send.return_value = [
            (
                None,
                {
-                    "parser": mail_get_parser,
+                    "parser": MailDocumentParser,
                    "mime_types": {"message/rfc822": ".eml"},
                    "weight": 0,
                },
@@ -1124,10 +1129,9 @@ class TestConsumer(
        with self.get_consumer(
            filepath=(
                Path(__file__).parent.parent.parent
-                / Path("paperless")
+                / Path("paperless_mail")
                / Path("tests")
                / Path("samples")
-                / Path("mail")
            ).resolve()
            / "html.eml",
            source=DocumentSource.MailFetch,
@@ -1138,10 +1142,12 @@ class TestConsumer(
                ConsumerError,
            ):
                consumer.run()
-            mock_mail_parser_parse.assert_called_once_with(
-                consumer.working_copy,
-                "message/rfc822",
-            )
+                mock_mail_parser_parse.assert_called_once_with(
+                    consumer.working_copy,
+                    "message/rfc822",
+                    file_name="sample.pdf",
+                    mailrule=mock_mailrule_get.return_value,
+                )


@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@@ -63,8 +63,8 @@ class TestExportImport(

        self.d1 = Document.objects.create(
            content="Content",
-            checksum="42995833e01aea9b3edee44bbfdd7ce1",
-            archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+            checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+            archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
            title="wow1",
            filename="0000001.pdf",
            mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
        )
        self.d2 = Document.objects.create(
            content="Content",
-            checksum="9c9691e51741c1f4f41a20896af31770",
+            checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
            title="wow2",
            filename="0000002.pdf",
            mime_type="application/pdf",
        )
        self.d3 = Document.objects.create(
            content="Content",
-            checksum="d38d7ed02e988e072caf924e0f3fcb76",
+            checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
            title="wow2",
            filename="0000003.pdf",
            mime_type="application/pdf",
        )
        self.d4 = Document.objects.create(
            content="Content",
-            checksum="82186aaa94f0b98697d704b90fd1c072",
+            checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
            title="wow_dec",
            filename="0000004.pdf",
            mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
                )

                with Path(fname).open("rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
+                    checksum = hashlib.sha256(f.read()).hexdigest()
                self.assertEqual(checksum, element["fields"]["checksum"])

                # Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
                    self.assertIsFile(fname)

                    with Path(fname).open("rb") as f:
-                        checksum = hashlib.md5(f.read()).hexdigest()
+                        checksum = hashlib.sha256(f.read()).hexdigest()
                    self.assertEqual(checksum, element["fields"]["archive_checksum"])

            elif element["model"] == "documents.note":
--- a/src/documents/tests/test_management_importer.py
+++ b/src/documents/tests/test_management_importer.py
@@ -277,8 +277,8 @@ class TestCommandImport(

        Document.objects.create(
            content="Content",
-            checksum="42995833e01aea9b3edee44bbfdd7ce1",
-            archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+            checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+            archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
            title="wow1",
            filename="0000001.pdf",
            mime_type="application/pdf",
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -9,9 +9,9 @@ from documents.parsers import get_default_file_extension
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
-from paperless.parsers.tesseract import RasterisedDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
+from paperless_tesseract.parsers import RasterisedDocumentParser


 class TestParserDiscovery(TestCase):
--- a/src/documents/utils.py
+++ b/src/documents/utils.py
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import shutil
 from os import utime
@@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool:
    Return a boolean value from a string representation.
    """
    return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
+
+
+def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
+    """
+    Return the hex-encoded SHA-256 digest of the file at ``path``.
+
+    The file is read ``chunk_size`` bytes at a time rather than all at once.
+
+    Args:
+        path (Path): Location of the file whose contents are hashed.
+        chunk_size (int, optional): Number of bytes requested per read.
+            Defaults to 65536.
+
+    Returns:
+        str: The digest as a hexadecimal string.
+
+    Raises:
+        FileNotFoundError: If nothing exists at ``path``.
+        OSError: If the file cannot be opened or read.
+    """
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        for block in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(block)
+    return digest.hexdigest()
--- a/src/paperless/parsers/__init__.py
+++ b/src/paperless/parsers/__init__.py
@@ -35,7 +35,6 @@ Usage example (third-party parser)::

 from __future__ import annotations

-from dataclasses import dataclass
 from typing import TYPE_CHECKING
 from typing import Protocol
 from typing import Self
@@ -49,7 +48,6 @@ if TYPE_CHECKING:

 __all__ = [
    "MetadataEntry",
-    "ParserContext",
    "ParserProtocol",
 ]

@@ -75,44 +73,6 @@ class MetadataEntry(TypedDict):
    """String representation of the field value."""


-@dataclass(frozen=True, slots=True)
-class ParserContext:
-    """Immutable context passed to a parser before parse().
-
-    The consumer assembles this from the ingestion event and Django
-    settings, then calls ``parser.configure(context)`` before
-    ``parser.parse()``.  Parsers read only the fields relevant to them;
-    unneeded fields are ignored.
-
-    ``frozen=True`` prevents accidental mutation after the consumer
-    hands the context off.  ``slots=True`` keeps instances lightweight.
-
-    Fields
-    ------
-    mailrule_id : int | None
-        Primary key of the ``MailRule`` that triggered this ingestion,
-        or ``None`` when the document did not arrive via a mail rule.
-        Used by ``MailDocumentParser`` to select the PDF layout.
-
-    Notes
-    -----
-    Future fields (not yet implemented):
-
-    * ``output_type`` — PDF/A variant for archive generation
-      (replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
-    * ``ocr_mode`` — skip-text, redo, force, etc.
-      (replaces ``settings.OCR_MODE`` reads inside parsers).
-    * ``ocr_language`` — Tesseract language string.
-      (replaces ``settings.OCR_LANGUAGE`` reads inside parsers).
-
-    When those fields are added the consumer will read from Django
-    settings once and populate them here, decoupling parsers from
-    ``settings.*`` entirely.
-    """
-
-    mailrule_id: int | None = None
-
-
@runtime_checkable
 class ParserProtocol(Protocol):
    """Structural contract for all Paperless-ngx document parsers.
@@ -231,21 +191,6 @@ class ParserProtocol(Protocol):
    # Core parsing interface
    # ------------------------------------------------------------------

-    def configure(self, context: ParserContext) -> None:
-        """Apply source context before parse().
-
-        Called by the consumer after instantiation and before parse().
-        The default implementation is a no-op; parsers override only the
-        fields they need.
-
-        Parameters
-        ----------
-        context:
-            Immutable context assembled by the consumer for this
-            specific ingestion event.
-        """
-        ...
-
    def parse(
        self,
        document_path: Path,
--- a/src/paperless/parsers/mail.py
+++ b/src/paperless/parsers/mail.py
@@ -1,834 +0,0 @@
-"""
-Built-in mail document parser.
-
-Handles message/rfc822 (EML) MIME type by:
- Parsing the email using imap_tools
- Generating a PDF via Gotenberg (for display and archive)
- Extracting text via Tika for HTML content
- Extracting metadata from email headers
-
-The parser always produces a PDF because EML files cannot be rendered
-natively in a browser (requires_pdf_rendition=True).
-"""
-
-from __future__ import annotations
-
-import logging
-import re
-import shutil
-import tempfile
-from html import escape
-from pathlib import Path
-from typing import TYPE_CHECKING
-from typing import Self
-
-from bleach import clean
-from bleach import linkify
-from django.conf import settings
-from django.utils import timezone
-from django.utils.timezone import is_naive
-from django.utils.timezone import make_aware
-from gotenberg_client import GotenbergClient
-from gotenberg_client.constants import A4
-from gotenberg_client.options import Measurement
-from gotenberg_client.options import MeasurementUnitType
-from gotenberg_client.options import PageMarginsType
-from gotenberg_client.options import PdfAFormat
-from humanize import naturalsize
-from imap_tools import MailAttachment
-from imap_tools import MailMessage
-from tika_client import TikaClient
-
-from documents.parsers import ParseError
-from documents.parsers import make_thumbnail_from_pdf
-from paperless.models import OutputTypeChoices
-from paperless.version import __full_version_str__
-from paperless_mail.models import MailRule
-
-if TYPE_CHECKING:
-    import datetime
-    from types import TracebackType
-
-    from paperless.parsers import MetadataEntry
-    from paperless.parsers import ParserContext
-
-logger = logging.getLogger("paperless.parsing.mail")
-
-_SUPPORTED_MIME_TYPES: dict[str, str] = {
-    "message/rfc822": ".eml",
-}
-
-
-class MailDocumentParser:
-    """Parse .eml email files for Paperless-ngx.
-
-    Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
-    and sends the HTML part to a Tika server for text extraction.  Because
-    EML files cannot be rendered natively in a browser, the parser always
-    produces a PDF rendition (requires_pdf_rendition=True).
-
-    Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
-    apply mail-rule-specific PDF layout options:
-
-        parser.configure(ParserContext(mailrule_id=rule.pk))
-        parser.parse(path, mime_type)
-
-    Class attributes
-    ----------------
-    name : str
-        Human-readable parser name.
-    version : str
-        Semantic version string, kept in sync with Paperless-ngx releases.
-    author : str
-        Maintainer name.
-    url : str
-        Issue tracker / source URL.
-    """
-
-    name: str = "Paperless-ngx Mail Parser"
-    version: str = __full_version_str__
-    author: str = "Paperless-ngx Contributors"
-    url: str = "https://github.com/paperless-ngx/paperless-ngx"
-
-    # ------------------------------------------------------------------
-    # Class methods
-    # ------------------------------------------------------------------
-
-    @classmethod
-    def supported_mime_types(cls) -> dict[str, str]:
-        """Return the MIME types this parser handles.
-
-        Returns
-        -------
-        dict[str, str]
-            Mapping of MIME type to preferred file extension.
-        """
-        return _SUPPORTED_MIME_TYPES
-
-    @classmethod
-    def score(
-        cls,
-        mime_type: str,
-        filename: str,
-        path: Path | None = None,
-    ) -> int | None:
-        """Return the priority score for handling this file.
-
-        Parameters
-        ----------
-        mime_type:
-            Detected MIME type of the file.
-        filename:
-            Original filename including extension.
-        path:
-            Optional filesystem path. Not inspected by this parser.
-
-        Returns
-        -------
-        int | None
-            10 if the MIME type is supported, otherwise None.
-        """
-        if mime_type in _SUPPORTED_MIME_TYPES:
-            return 10
-        return None
-
-    # ------------------------------------------------------------------
-    # Properties
-    # ------------------------------------------------------------------
-
-    @property
-    def can_produce_archive(self) -> bool:
-        """Whether this parser can produce a searchable PDF archive copy.
-
-        Returns
-        -------
-        bool
-            Always False — the mail parser produces a display PDF
-            (requires_pdf_rendition=True), not an optional OCR archive.
-        """
-        return False
-
-    @property
-    def requires_pdf_rendition(self) -> bool:
-        """Whether the parser must produce a PDF for the frontend to display.
-
-        Returns
-        -------
-        bool
-            Always True — EML files cannot be rendered natively in a browser,
-            so a PDF conversion is always required for display.
-        """
-        return True
-
-    # ------------------------------------------------------------------
-    # Lifecycle
-    # ------------------------------------------------------------------
-
-    def __init__(self, logging_group: object = None) -> None:
-        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
-        self._tempdir = Path(
-            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
-        )
-        self._text: str | None = None
-        self._date: datetime.datetime | None = None
-        self._archive_path: Path | None = None
-        self._mailrule_id: int | None = None
-
-    def __enter__(self) -> Self:
-        return self
-
-    def __exit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: TracebackType | None,
-    ) -> None:
-        logger.debug("Cleaning up temporary directory %s", self._tempdir)
-        shutil.rmtree(self._tempdir, ignore_errors=True)
-
-    # ------------------------------------------------------------------
-    # Core parsing interface
-    # ------------------------------------------------------------------
-
-    def configure(self, context: ParserContext) -> None:
-        self._mailrule_id = context.mailrule_id
-
-    def parse(
-        self,
-        document_path: Path,
-        mime_type: str,
-        *,
-        produce_archive: bool = True,
-    ) -> None:
-        """Parse the given .eml into formatted text and a PDF archive.
-
-        Call ``configure(ParserContext(mailrule_id=...))`` before this method
-        to apply mail-rule-specific PDF layout options.  The ``produce_archive``
-        flag is accepted for protocol compatibility but is always honoured —
-        the mail parser always produces a PDF since EML files cannot be
-        displayed natively.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the .eml file.
-        mime_type:
-            Detected MIME type of the document (should be "message/rfc822").
-        produce_archive:
-            Accepted for protocol compatibility. The PDF rendition is always
-            produced since EML files cannot be displayed natively in a browser.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If the file cannot be parsed or PDF generation fails.
-        """
-
-        def strip_text(text: str) -> str:
-            """Reduces the spacing of the given text string."""
-            text = re.sub(r"\s+", " ", text)
-            text = re.sub(r"(\n *)+", "\n", text)
-            return text.strip()
-
-        def build_formatted_text(mail_message: MailMessage) -> str:
-            """Constructs a formatted string based on the given email."""
-            fmt_text = f"Subject: {mail_message.subject}\n\n"
-            fmt_text += f"From: {mail_message.from_values.full if mail_message.from_values else ''}\n\n"
-            to_list = [address.full for address in mail_message.to_values]
-            fmt_text += f"To: {', '.join(to_list)}\n\n"
-            if mail_message.cc_values:
-                fmt_text += (
-                    f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
-                )
-            if mail_message.bcc_values:
-                fmt_text += (
-                    f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
-                )
-            if mail_message.attachments:
-                att = []
-                for a in mail.attachments:
-                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
-                    att.append(
-                        f"{a.filename} ({attachment_size})",
-                    )
-                fmt_text += f"Attachments: {', '.join(att)}\n\n"
-
-            if mail.html:
-                fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
-
-            fmt_text += f"\n\n{strip_text(mail.text)}"
-
-            return fmt_text
-
-        logger.debug("Parsing file %s into an email", document_path.name)
-        mail = self.parse_file_to_message(document_path)
-
-        logger.debug("Building formatted text from email")
-        self._text = build_formatted_text(mail)
-
-        if is_naive(mail.date):
-            self._date = make_aware(mail.date)
-        else:
-            self._date = mail.date
-
-        logger.debug("Creating a PDF from the email")
-        if self._mailrule_id:
-            rule = MailRule.objects.get(pk=self._mailrule_id)
-            self._archive_path = self.generate_pdf(
-                mail,
-                MailRule.PdfLayout(rule.pdf_layout),
-            )
-        else:
-            self._archive_path = self.generate_pdf(mail)
-
-    # ------------------------------------------------------------------
-    # Result accessors
-    # ------------------------------------------------------------------
-
-    def get_text(self) -> str | None:
-        """Return the plain-text content extracted during parse.
-
-        Returns
-        -------
-        str | None
-            Extracted text, or None if parse has not been called yet.
-        """
-        return self._text
-
-    def get_date(self) -> datetime.datetime | None:
-        """Return the document date detected during parse.
-
-        Returns
-        -------
-        datetime.datetime | None
-            Date from the email headers, or None if not detected.
-        """
-        return self._date
-
-    def get_archive_path(self) -> Path | None:
-        """Return the path to the generated archive PDF, or None.
-
-        Returns
-        -------
-        Path | None
-            Path to the PDF produced by Gotenberg, or None if parse has not
-            been called yet.
-        """
-        return self._archive_path
-
-    # ------------------------------------------------------------------
-    # Thumbnail and metadata
-    # ------------------------------------------------------------------
-
-    def get_thumbnail(
-        self,
-        document_path: Path,
-        mime_type: str,
-        file_name: str | None = None,
-    ) -> Path:
-        """Generate a thumbnail from the PDF rendition of the email.
-
-        Converts the document to PDF first if not already done.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the source document.
-        mime_type:
-            Detected MIME type of the document.
-        file_name:
-            Kept for backward compatibility; not used.
-
-        Returns
-        -------
-        Path
-            Path to the generated WebP thumbnail inside the temporary directory.
-        """
-        if not self._archive_path:
-            self._archive_path = self.generate_pdf(
-                self.parse_file_to_message(document_path),
-            )
-
-        return make_thumbnail_from_pdf(
-            self._archive_path,
-            self._tempdir,
-        )
-
-    def get_page_count(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> int | None:
-        """Return the number of pages in the document.
-
-        Counts pages in the archive PDF produced by a preceding parse()
-        call.  Returns ``None`` if parse() has not been called yet or if
-        no archive was produced.
-
-        Returns
-        -------
-        int | None
-            Page count of the archive PDF, or ``None``.
-        """
-        if self._archive_path is not None:
-            from paperless.parsers.utils import get_page_count_for_pdf
-
-            return get_page_count_for_pdf(self._archive_path, log=logger)
-        return None
-
-    def extract_metadata(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> list[MetadataEntry]:
-        """Extract metadata from the email headers.
-
-        Returns email headers as metadata entries with prefix "header",
-        plus summary entries for attachments and date.
-
-        Returns
-        -------
-        list[MetadataEntry]
-            Sorted list of metadata entries, or ``[]`` on parse failure.
-        """
-        result: list[MetadataEntry] = []
-
-        try:
-            mail = self.parse_file_to_message(document_path)
-        except ParseError as e:
-            logger.warning(
-                "Error while fetching document metadata for %s: %s",
-                document_path,
-                e,
-            )
-            return result
-
-        for key, header_values in mail.headers.items():
-            value = ", ".join(header_values)
-            try:
-                value.encode("utf-8")
-            except UnicodeEncodeError as e:  # pragma: no cover
-                logger.debug("Skipping header %s: %s", key, e)
-                continue
-
-            result.append(
-                {
-                    "namespace": "",
-                    "prefix": "header",
-                    "key": key,
-                    "value": value,
-                },
-            )
-
-        result.append(
-            {
-                "namespace": "",
-                "prefix": "",
-                "key": "attachments",
-                "value": ", ".join(
-                    f"{attachment.filename}"
-                    f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
-                    for attachment in mail.attachments
-                ),
-            },
-        )
-
-        result.append(
-            {
-                "namespace": "",
-                "prefix": "",
-                "key": "date",
-                "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
-            },
-        )
-
-        result.sort(key=lambda item: (item["prefix"], item["key"]))
-        return result
-
-    # ------------------------------------------------------------------
-    # Email-specific methods
-    # ------------------------------------------------------------------
-
-    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
-        """Convert the OCR output type setting to a Gotenberg PdfAFormat."""
-        if settings.OCR_OUTPUT_TYPE in {
-            OutputTypeChoices.PDF_A,
-            OutputTypeChoices.PDF_A2,
-        }:
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
-            logger.warning(
-                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
-            )
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
-            return PdfAFormat.A3b
-        return None
-
-    @staticmethod
-    def parse_file_to_message(filepath: Path) -> MailMessage:
-        """Parse the given .eml file into a MailMessage object.
-
-        Parameters
-        ----------
-        filepath:
-            Path to the .eml file.
-
-        Returns
-        -------
-        MailMessage
-            Parsed mail message.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If the file cannot be parsed or is missing required fields.
-        """
-        try:
-            with filepath.open("rb") as eml:
-                parsed = MailMessage.from_bytes(eml.read())
-                if parsed.from_values is None:
-                    raise ParseError(
-                        f"Could not parse {filepath}: Missing 'from'",
-                    )
-        except Exception as err:
-            raise ParseError(
-                f"Could not parse {filepath}: {err}",
-            ) from err
-
-        return parsed
-
-    def tika_parse(self, html: str) -> str:
-        """Send HTML content to the Tika server for text extraction.
-
-        Parameters
-        ----------
-        html:
-            HTML string to parse.
-
-        Returns
-        -------
-        str
-            Extracted plain text.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If the Tika server cannot be reached or returns an error.
-        """
-        logger.info("Sending content to Tika server")
-
-        try:
-            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
-                parsed = client.tika.as_text.from_buffer(html, "text/html")
-
-                if parsed.content is not None:
-                    return parsed.content.strip()
-                return ""
-        except Exception as err:
-            raise ParseError(
-                f"Could not parse content with tika server at "
-                f"{settings.TIKA_ENDPOINT}: {err}",
-            ) from err
-
-    def generate_pdf(
-        self,
-        mail_message: MailMessage,
-        pdf_layout: MailRule.PdfLayout | None = None,
-    ) -> Path:
-        """Generate a PDF from the email message.
-
-        Creates separate PDFs for the email body and HTML content, then
-        merges them according to the requested layout.
-
-        Parameters
-        ----------
-        mail_message:
-            Parsed email message.
-        pdf_layout:
-            Layout option for the PDF. Falls back to the
-            EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
-
-        Returns
-        -------
-        Path
-            Path to the generated PDF inside the temporary directory.
-        """
-        archive_path = Path(self._tempdir) / "merged.pdf"
-
-        mail_pdf_file = self.generate_pdf_from_mail(mail_message)
-
-        if pdf_layout is None:
-            pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)
-
-        # If no HTML content, create the PDF from the message.
-        # Otherwise, create 2 PDFs and merge them with Gotenberg.
-        if not mail_message.html:
-            archive_path.write_bytes(mail_pdf_file.read_bytes())
-        else:
-            pdf_of_html_content = self.generate_pdf_from_html(
-                mail_message.html,
-                mail_message.attachments,
-            )
-
-            logger.debug("Merging email text and HTML content into single PDF")
-
-            with (
-                GotenbergClient(
-                    host=settings.TIKA_GOTENBERG_ENDPOINT,
-                    timeout=settings.CELERY_TASK_TIME_LIMIT,
-                ) as client,
-                client.merge.merge() as route,
-            ):
-                # Configure requested PDF/A formatting, if any
-                pdf_a_format = self._settings_to_gotenberg_pdfa()
-                if pdf_a_format is not None:
-                    route.pdf_format(pdf_a_format)
-
-                match pdf_layout:
-                    case MailRule.PdfLayout.HTML_TEXT:
-                        route.merge([pdf_of_html_content, mail_pdf_file])
-                    case MailRule.PdfLayout.HTML_ONLY:
-                        route.merge([pdf_of_html_content])
-                    case MailRule.PdfLayout.TEXT_ONLY:
-                        route.merge([mail_pdf_file])
-                    case MailRule.PdfLayout.TEXT_HTML | _:
-                        route.merge([mail_pdf_file, pdf_of_html_content])
-
-                try:
-                    response = route.run()
-                    archive_path.write_bytes(response.content)
-                except Exception as err:
-                    raise ParseError(
-                        f"Error while merging email HTML into PDF: {err}",
-                    ) from err
-
-        return archive_path
-
-    def mail_to_html(self, mail: MailMessage) -> Path:
-        """Convert the given email into an HTML file using a template.
-
-        Parameters
-        ----------
-        mail:
-            Parsed mail message.
-
-        Returns
-        -------
-        Path
-            Path to the rendered HTML file inside the temporary directory.
-        """
-
-        def clean_html(text: str) -> str:
-            """Attempt to clean, escape, and linkify the given HTML string."""
-            if isinstance(text, list):
-                text = "\n".join([str(e) for e in text])
-            if not isinstance(text, str):
-                text = str(text)
-            text = escape(text)
-            text = clean(text)
-            text = linkify(text, parse_email=True)
-            text = text.replace("\n", "<br>")
-            return text
-
-        data = {}
-
-        data["subject"] = clean_html(mail.subject)
-        if data["subject"]:
-            data["subject_label"] = "Subject"
-        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
-        if data["from"]:
-            data["from_label"] = "From"
-        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
-        if data["to"]:
-            data["to_label"] = "To"
-        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
-        if data["cc"]:
-            data["cc_label"] = "CC"
-        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
-        if data["bcc"]:
-            data["bcc_label"] = "BCC"
-
-        att = []
-        for a in mail.attachments:
-            att.append(
-                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
-            )
-        data["attachments"] = clean_html(", ".join(att))
-        if data["attachments"]:
-            data["attachments_label"] = "Attachments"
-
-        data["date"] = clean_html(
-            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
-        )
-        data["content"] = clean_html(mail.text.strip())
-
-        from django.template.loader import render_to_string
-
-        html_file = Path(self._tempdir) / "email_as_html.html"
-        html_file.write_text(render_to_string("email_msg_template.html", context=data))
-
-        return html_file
-
-    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
-        """Create a PDF from the email body using an HTML template and Gotenberg.
-
-        Parameters
-        ----------
-        mail:
-            Parsed mail message.
-
-        Returns
-        -------
-        Path
-            Path to the generated PDF inside the temporary directory.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If Gotenberg returns an error.
-        """
-        logger.info("Converting mail to PDF")
-
-        css_file = (
-            Path(__file__).parent.parent.parent
-            / "paperless_mail"
-            / "templates"
-            / "output.css"
-        )
-        email_html_file = self.mail_to_html(mail)
-
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.chromium.html_to_pdf() as route,
-        ):
-            # Configure requested PDF/A formatting, if any
-            pdf_a_format = self._settings_to_gotenberg_pdfa()
-            if pdf_a_format is not None:
-                route.pdf_format(pdf_a_format)
-
-            try:
-                response = (
-                    route.index(email_html_file)
-                    .resource(css_file)
-                    .margins(
-                        PageMarginsType(
-                            top=Measurement(0.1, MeasurementUnitType.Inches),
-                            bottom=Measurement(0.1, MeasurementUnitType.Inches),
-                            left=Measurement(0.1, MeasurementUnitType.Inches),
-                            right=Measurement(0.1, MeasurementUnitType.Inches),
-                        ),
-                    )
-                    .size(A4)
-                    .scale(1.0)
-                    .run()
-                )
-            except Exception as err:
-                raise ParseError(
-                    f"Error while converting email to PDF: {err}",
-                ) from err
-
-        email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
-        email_as_pdf_file.write_bytes(response.content)
-
-        return email_as_pdf_file
-
-    def generate_pdf_from_html(
-        self,
-        orig_html: str,
-        attachments: list[MailAttachment],
-    ) -> Path:
-        """Generate a PDF from the HTML content of the email.
-
-        Parameters
-        ----------
-        orig_html:
-            Raw HTML string from the email body.
-        attachments:
-            List of email attachments (used as inline resources).
-
-        Returns
-        -------
-        Path
-            Path to the generated PDF inside the temporary directory.
-
-        Raises
-        ------
-        documents.parsers.ParseError
-            If Gotenberg returns an error.
-        """
-
-        def clean_html_script(text: str) -> str:
-            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
-            text = compiled_open.sub("<div hidden ", text)
-
-            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
-            text = compiled_close.sub("</div", text)
-            return text
-
-        logger.info("Converting message html to PDF")
-
-        tempdir = Path(self._tempdir)
-
-        html_clean = clean_html_script(orig_html)
-        html_clean_file = tempdir / "index.html"
-        html_clean_file.write_text(html_clean)
-
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.chromium.html_to_pdf() as route,
-        ):
-            # Configure requested PDF/A formatting, if any
-            pdf_a_format = self._settings_to_gotenberg_pdfa()
-            if pdf_a_format is not None:
-                route.pdf_format(pdf_a_format)
-
-            # Add attachments as resources, cleaning the filename and replacing
-            # it in the index file for inclusion
-            for attachment in attachments:
-                # Clean the attachment name to be valid
-                name_cid = f"cid:{attachment.content_id}"
-                name_clean = "".join(e for e in name_cid if e.isalnum())
-
-                # Write attachment payload to a temp file
-                temp_file = tempdir / name_clean
-                temp_file.write_bytes(attachment.payload)
-
-                route.resource(temp_file)
-
-                # Replace as needed the name with the clean name
-                html_clean = html_clean.replace(name_cid, name_clean)
-
-            # Now store the cleaned up HTML version
-            html_clean_file = tempdir / "index.html"
-            html_clean_file.write_text(html_clean)
-            # This is our index file, the main page basically
-            route.index(html_clean_file)
-
-            # Set page size, margins
-            route.margins(
-                PageMarginsType(
-                    top=Measurement(0.1, MeasurementUnitType.Inches),
-                    bottom=Measurement(0.1, MeasurementUnitType.Inches),
-                    left=Measurement(0.1, MeasurementUnitType.Inches),
-                    right=Measurement(0.1, MeasurementUnitType.Inches),
-                ),
-            ).size(A4).scale(1.0)
-
-            try:
-                response = route.run()
-
-            except Exception as err:
-                raise ParseError(
-                    f"Error while converting document to PDF: {err}",
-                ) from err
-
-        html_pdf = tempdir / "html.pdf"
-        html_pdf.write_bytes(response.content)
-        return html_pdf
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -193,17 +193,11 @@ class ParserRegistry:
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
-        from paperless.parsers.mail import MailDocumentParser
-        from paperless.parsers.remote import RemoteDocumentParser
-        from paperless.parsers.tesseract import RasterisedDocumentParser
        from paperless.parsers.text import TextDocumentParser
        from paperless.parsers.tika import TikaDocumentParser

        self.register_builtin(TextDocumentParser)
-        self.register_builtin(RemoteDocumentParser)
        self.register_builtin(TikaDocumentParser)
-        self.register_builtin(MailDocumentParser)
-        self.register_builtin(RasterisedDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
--- a/src/paperless/parsers/remote.py
+++ b/src/paperless/parsers/remote.py
@@ -1,433 +0,0 @@
-"""
-Built-in remote-OCR document parser.
-
-Handles documents by sending them to a configured remote OCR engine
-(currently Azure AI Vision / Document Intelligence) and retrieving both
-the extracted text and a searchable PDF with an embedded text layer.
-
-When no engine is configured, ``score()`` returns ``None`` so the parser
-is effectively invisible to the registry — the tesseract parser handles
-these MIME types instead.
-"""
-
-from __future__ import annotations
-
-import logging
-import shutil
-import tempfile
-from pathlib import Path
-from typing import TYPE_CHECKING
-from typing import Self
-
-from django.conf import settings
-
-from paperless.version import __full_version_str__
-
-if TYPE_CHECKING:
-    import datetime
-    from types import TracebackType
-
-    from paperless.parsers import MetadataEntry
-    from paperless.parsers import ParserContext
-
-logger = logging.getLogger("paperless.parsing.remote")
-
-_SUPPORTED_MIME_TYPES: dict[str, str] = {
-    "application/pdf": ".pdf",
-    "image/png": ".png",
-    "image/jpeg": ".jpg",
-    "image/tiff": ".tiff",
-    "image/bmp": ".bmp",
-    "image/gif": ".gif",
-    "image/webp": ".webp",
-}
-
-
-class RemoteEngineConfig:
-    """Holds and validates the remote OCR engine configuration."""
-
-    def __init__(
-        self,
-        engine: str | None,
-        api_key: str | None = None,
-        endpoint: str | None = None,
-    ) -> None:
-        self.engine = engine
-        self.api_key = api_key
-        self.endpoint = endpoint
-
-    def engine_is_valid(self) -> bool:
-        """Return True when the engine is known and fully configured."""
-        return (
-            self.engine in ("azureai",)
-            and self.api_key is not None
-            and not (self.engine == "azureai" and self.endpoint is None)
-        )
-
-
-class RemoteDocumentParser:
-    """Parse documents via a remote OCR API (currently Azure AI Vision).
-
-    This parser sends documents to a remote engine that returns both
-    extracted text and a searchable PDF with an embedded text layer.
-    It does not depend on Tesseract or ocrmypdf.
-
-    Class attributes
-    ----------------
-    name : str
-        Human-readable parser name.
-    version : str
-        Semantic version string, kept in sync with Paperless-ngx releases.
-    author : str
-        Maintainer name.
-    url : str
-        Issue tracker / source URL.
-    """
-
-    name: str = "Paperless-ngx Remote OCR Parser"
-    version: str = __full_version_str__
-    author: str = "Paperless-ngx Contributors"
-    url: str = "https://github.com/paperless-ngx/paperless-ngx"
-
-    # ------------------------------------------------------------------
-    # Class methods
-    # ------------------------------------------------------------------
-
-    @classmethod
-    def supported_mime_types(cls) -> dict[str, str]:
-        """Return the MIME types this parser can handle.
-
-        The full set is always returned regardless of whether a remote
-        engine is configured.  The ``score()`` method handles the
-        "am I active?" logic by returning ``None`` when not configured.
-
-        Returns
-        -------
-        dict[str, str]
-            Mapping of MIME type to preferred file extension.
-        """
-        return _SUPPORTED_MIME_TYPES
-
-    @classmethod
-    def score(
-        cls,
-        mime_type: str,
-        filename: str,
-        path: Path | None = None,
-    ) -> int | None:
-        """Return the priority score for handling this file, or None.
-
-        Returns ``None`` when no valid remote engine is configured,
-        making the parser invisible to the registry for this file.
-        When configured, returns 20 — higher than the Tesseract parser's
-        default of 10 — so the remote engine takes priority.
-
-        Parameters
-        ----------
-        mime_type:
-            Detected MIME type of the file.
-        filename:
-            Original filename including extension.
-        path:
-            Optional filesystem path. Not inspected by this parser.
-
-        Returns
-        -------
-        int | None
-            20 when the remote engine is configured and the MIME type is
-            supported, otherwise None.
-        """
-        config = RemoteEngineConfig(
-            engine=settings.REMOTE_OCR_ENGINE,
-            api_key=settings.REMOTE_OCR_API_KEY,
-            endpoint=settings.REMOTE_OCR_ENDPOINT,
-        )
-        if not config.engine_is_valid():
-            return None
-        if mime_type not in _SUPPORTED_MIME_TYPES:
-            return None
-        return 20
-
-    # ------------------------------------------------------------------
-    # Properties
-    # ------------------------------------------------------------------
-
-    @property
-    def can_produce_archive(self) -> bool:
-        """Whether this parser can produce a searchable PDF archive copy.
-
-        Returns
-        -------
-        bool
-            Always True — the remote engine always returns a PDF with an
-            embedded text layer that serves as the archive copy.
-        """
-        return True
-
-    @property
-    def requires_pdf_rendition(self) -> bool:
-        """Whether the parser must produce a PDF for the frontend to display.
-
-        Returns
-        -------
-        bool
-            Always False — all supported originals are displayable by
-            the browser (PDF) or handled via the archive copy (images).
-        """
-        return False
-
-    # ------------------------------------------------------------------
-    # Lifecycle
-    # ------------------------------------------------------------------
-
-    def __init__(self, logging_group: object = None) -> None:
-        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
-        self._tempdir = Path(
-            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
-        )
-        self._logging_group = logging_group
-        self._text: str | None = None
-        self._archive_path: Path | None = None
-
-    def __enter__(self) -> Self:
-        return self
-
-    def __exit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: TracebackType | None,
-    ) -> None:
-        logger.debug("Cleaning up temporary directory %s", self._tempdir)
-        shutil.rmtree(self._tempdir, ignore_errors=True)
-
-    # ------------------------------------------------------------------
-    # Core parsing interface
-    # ------------------------------------------------------------------
-
-    def configure(self, context: ParserContext) -> None:
-        pass
-
-    def parse(
-        self,
-        document_path: Path,
-        mime_type: str,
-        *,
-        produce_archive: bool = True,
-    ) -> None:
-        """Send the document to the remote engine and store results.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the document file to parse.
-        mime_type:
-            Detected MIME type of the document.
-        produce_archive:
-            Ignored — the remote engine always returns a searchable PDF,
-            which is stored as the archive copy regardless of this flag.
-        """
-        config = RemoteEngineConfig(
-            engine=settings.REMOTE_OCR_ENGINE,
-            api_key=settings.REMOTE_OCR_API_KEY,
-            endpoint=settings.REMOTE_OCR_ENDPOINT,
-        )
-
-        if not config.engine_is_valid():
-            logger.warning(
-                "No valid remote parser engine is configured, content will be empty.",
-            )
-            self._text = ""
-            return
-
-        if config.engine == "azureai":
-            self._text = self._azure_ai_vision_parse(document_path, config)
-
-    # ------------------------------------------------------------------
-    # Result accessors
-    # ------------------------------------------------------------------
-
-    def get_text(self) -> str | None:
-        """Return the plain-text content extracted during parse."""
-        return self._text
-
-    def get_date(self) -> datetime.datetime | None:
-        """Return the document date detected during parse.
-
-        Returns
-        -------
-        datetime.datetime | None
-            Always None — the remote parser does not detect dates.
-        """
-        return None
-
-    def get_archive_path(self) -> Path | None:
-        """Return the path to the generated archive PDF, or None."""
-        return self._archive_path
-
-    # ------------------------------------------------------------------
-    # Thumbnail and metadata
-    # ------------------------------------------------------------------
-
-    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
-        """Generate a thumbnail image for the document.
-
-        Uses the archive PDF produced by the remote engine when available,
-        otherwise falls back to the original document path (PDF inputs).
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the source document.
-        mime_type:
-            Detected MIME type of the document.
-
-        Returns
-        -------
-        Path
-            Path to the generated WebP thumbnail inside the temp directory.
-        """
-        # make_thumbnail_from_pdf lives in documents.parsers for now;
-        # it will move to paperless.parsers.utils when the tesseract
-        # parser is migrated in a later phase.
-        from documents.parsers import make_thumbnail_from_pdf
-
-        return make_thumbnail_from_pdf(
-            self._archive_path or document_path,
-            self._tempdir,
-            self._logging_group,
-        )
-
-    def get_page_count(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> int | None:
-        """Return the number of pages in a PDF document.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the source document.
-        mime_type:
-            Detected MIME type of the document.
-
-        Returns
-        -------
-        int | None
-            Page count for PDF inputs, or ``None`` for other MIME types.
-        """
-        if mime_type != "application/pdf":
-            return None
-
-        from paperless.parsers.utils import get_page_count_for_pdf
-
-        return get_page_count_for_pdf(document_path, log=logger)
-
-    def extract_metadata(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> list[MetadataEntry]:
-        """Extract format-specific metadata from the document.
-
-        Delegates to the shared pikepdf-based extractor for PDF files.
-        Returns ``[]`` for all other MIME types.
-
-        Parameters
-        ----------
-        document_path:
-            Absolute path to the file to extract metadata from.
-        mime_type:
-            MIME type of the file.  May be ``"application/pdf"`` when
-            called for the archive version of an image original.
-
-        Returns
-        -------
-        list[MetadataEntry]
-            Zero or more metadata entries.
-        """
-        if mime_type != "application/pdf":
-            return []
-
-        from paperless.parsers.utils import extract_pdf_metadata
-
-        return extract_pdf_metadata(document_path, log=logger)
-
-    # ------------------------------------------------------------------
-    # Private helpers
-    # ------------------------------------------------------------------
-
-    def _azure_ai_vision_parse(
-        self,
-        file: Path,
-        config: RemoteEngineConfig,
-    ) -> str | None:
-        """Send ``file`` to Azure AI Document Intelligence and return text.
-
-        Downloads the searchable PDF output from Azure and stores it at
-        ``self._archive_path``.  Returns the extracted text content, or
-        ``None`` on failure (the error is logged).
-
-        Parameters
-        ----------
-        file:
-            Absolute path to the document to analyse.
-        config:
-            Validated remote engine configuration.
-
-        Returns
-        -------
-        str | None
-            Extracted text, or None if the Azure call failed.
-        """
-        if TYPE_CHECKING:
-            # Callers must have already validated config via engine_is_valid():
-            # engine_is_valid() asserts api_key is not None and (for azureai)
-            # endpoint is not None, so these casts are provably safe.
-            assert config.endpoint is not None
-            assert config.api_key is not None
-
-        from azure.ai.documentintelligence import DocumentIntelligenceClient
-        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
-        from azure.ai.documentintelligence.models import AnalyzeOutputOption
-        from azure.ai.documentintelligence.models import DocumentContentFormat
-        from azure.core.credentials import AzureKeyCredential
-
-        client = DocumentIntelligenceClient(
-            endpoint=config.endpoint,
-            credential=AzureKeyCredential(config.api_key),
-        )
-
-        try:
-            with file.open("rb") as f:
-                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
-                poller = client.begin_analyze_document(
-                    model_id="prebuilt-read",
-                    body=analyze_request,
-                    output_content_format=DocumentContentFormat.TEXT,
-                    output=[AnalyzeOutputOption.PDF],
-                    content_type="application/json",
-                )
-
-            poller.wait()
-            result_id = poller.details["operation_id"]
-            result = poller.result()
-
-            self._archive_path = self._tempdir / "archive.pdf"
-            with self._archive_path.open("wb") as f:
-                for chunk in client.get_analyze_result_pdf(
-                    model_id="prebuilt-read",
-                    result_id=result_id,
-                ):
-                    f.write(chunk)
-
-            return result.content
-
-        except Exception as e:
-            logger.error("Azure AI Vision parsing failed: %s", e)
-
-        finally:
-            client.close()
-
-        return None
--- a/src/paperless/parsers/text.py
+++ b/src/paperless/parsers/text.py
@@ -27,7 +27,6 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
-    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.text")

@@ -157,9 +156,6 @@ class TextDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

-    def configure(self, context: ParserContext) -> None:
-        pass
-
    def parse(
        self,
        document_path: Path,
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -35,7 +35,6 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
-    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.tika")

@@ -206,9 +205,6 @@ class TikaDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

-    def configure(self, context: ParserContext) -> None:
-        pass
-
    def parse(
        self,
        document_path: Path,
@@ -344,19 +340,11 @@ class TikaDocumentParser:
    ) -> int | None:
        """Return the number of pages in the document.

-        Counts pages in the archive PDF produced by a preceding parse()
-        call.  Returns ``None`` if parse() has not been called yet or if
-        no archive was produced.
-
        Returns
        -------
        int | None
-            Page count of the archive PDF, or ``None``.
+            Always None — page count is not available from Tika.
        """
-        if self._archive_path is not None:
-            from paperless.parsers.utils import get_page_count_for_pdf
-
-            return get_page_count_for_pdf(self._archive_path, log=logger)
        return None

    def extract_metadata(
--- a/src/paperless/parsers/utils.py
+++ b/src/paperless/parsers/utils.py
@@ -1,158 +0,0 @@
-"""
-Shared utilities for Paperless-ngx document parsers.
-
-Functions here are format-neutral helpers that multiple parsers need.
-Keeping them here avoids parsers inheriting from each other just to
-share implementation.
-"""
-
-from __future__ import annotations
-
-import logging
-import re
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-    from paperless.parsers import MetadataEntry
-
-logger = logging.getLogger("paperless.parsers.utils")
-
-
-def read_file_handle_unicode_errors(
-    filepath: Path,
-    log: logging.Logger | None = None,
-) -> str:
-    """Read a file as UTF-8 text, replacing invalid bytes rather than raising.
-
-    Parameters
-    ----------
-    filepath:
-        Absolute path to the file to read.
-    log:
-        Logger to use for warnings.  Falls back to the module-level logger
-        when omitted.
-
-    Returns
-    -------
-    str
-        File content as a string, with any invalid UTF-8 sequences replaced
-        by the Unicode replacement character.
-    """
-    _log = log or logger
-    try:
-        return filepath.read_text(encoding="utf-8")
-    except UnicodeDecodeError as e:
-        _log.warning("Unicode error during text reading, continuing: %s", e)
-        return filepath.read_bytes().decode("utf-8", errors="replace")
-
-
-def get_page_count_for_pdf(
-    document_path: Path,
-    log: logging.Logger | None = None,
-) -> int | None:
-    """Return the number of pages in a PDF file using pikepdf.
-
-    Parameters
-    ----------
-    document_path:
-        Absolute path to the PDF file.
-    log:
-        Logger to use for warnings.  Falls back to the module-level logger
-        when omitted.
-
-    Returns
-    -------
-    int | None
-        Page count, or ``None`` if the file cannot be opened or is not a
-        valid PDF.
-    """
-    import pikepdf
-
-    _log = log or logger
-
-    try:
-        with pikepdf.Pdf.open(document_path) as pdf:
-            return len(pdf.pages)
-    except Exception as e:
-        _log.warning("Unable to determine PDF page count for %s: %s", document_path, e)
-        return None
-
-
-def extract_pdf_metadata(
-    document_path: Path,
-    log: logging.Logger | None = None,
-) -> list[MetadataEntry]:
-    """Extract XMP/PDF metadata from a PDF file using pikepdf.
-
-    Reads all XMP metadata entries from the document and returns them as a
-    list of ``MetadataEntry`` dicts.  The method never raises — any failure
-    to open the file or read a specific key is logged and skipped.
-
-    Parameters
-    ----------
-    document_path:
-        Absolute path to the PDF file.
-    log:
-        Logger to use for warnings and debug messages.  Falls back to the
-        module-level logger when omitted.
-
-    Returns
-    -------
-    list[MetadataEntry]
-        Zero or more metadata entries.  Returns ``[]`` if the file cannot
-        be opened or contains no readable XMP metadata.
-    """
-    import pikepdf
-
-    from paperless.parsers import MetadataEntry
-
-    _log = log or logger
-    result: list[MetadataEntry] = []
-    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
-
-    try:
-        pdf = pikepdf.open(document_path)
-        meta = pdf.open_metadata()
-    except Exception as e:
-        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
-        return []
-
-    for key, value in meta.items():
-        if isinstance(value, list):
-            value = " ".join(str(e) for e in value)
-        value = str(value)
-
-        try:
-            m = namespace_pattern.match(key)
-            if m is None:
-                continue
-
-            namespace = m.group(1)
-            key_value = m.group(2)
-
-            try:
-                namespace.encode("utf-8")
-                key_value.encode("utf-8")
-            except UnicodeEncodeError as enc_err:
-                _log.debug("Skipping metadata key %s: %s", key, enc_err)
-                continue
-
-            result.append(
-                MetadataEntry(
-                    namespace=namespace,
-                    prefix=meta.REVERSE_NS[namespace],
-                    key=key_value,
-                    value=value,
-                ),
-            )
-        except Exception as e:
-            _log.warning(
-                "Error reading metadata key %s value %s: %s",
-                key,
-                value,
-                e,
-            )
-
-    return result
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -6,29 +6,16 @@ so it is easy to see which files belong to which test module.

 from __future__ import annotations

-from contextlib import contextmanager
 from typing import TYPE_CHECKING

 import pytest
-from django.test import override_settings

-from paperless.parsers.mail import MailDocumentParser
-from paperless.parsers.remote import RemoteDocumentParser
-from paperless.parsers.tesseract import RasterisedDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser

 if TYPE_CHECKING:
-    from collections.abc import Callable
    from collections.abc import Generator
    from pathlib import Path
-    from unittest.mock import MagicMock
-
-    from pytest_django.fixtures import SettingsWrapper
-    from pytest_mock import MockerFixture
-
-    #: Type for the ``make_tesseract_parser`` fixture factory.
-    MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]


 # ------------------------------------------------------------------
@@ -90,92 +77,6 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
        yield parser


-# ------------------------------------------------------------------
-# Remote parser sample files
-# ------------------------------------------------------------------
-
-
-@pytest.fixture(scope="session")
-def remote_samples_dir(samples_dir: Path) -> Path:
-    """Absolute path to the remote parser sample files directory.
-
-    Returns
-    -------
-    Path
-        ``<samples_dir>/remote/``
-    """
-    return samples_dir / "remote"
-
-
-@pytest.fixture(scope="session")
-def sample_pdf_file(remote_samples_dir: Path) -> Path:
-    """Path to a simple digital PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``remote/simple-digital.pdf``.
-    """
-    return remote_samples_dir / "simple-digital.pdf"
-
-
-# ------------------------------------------------------------------
-# Remote parser instance
-# ------------------------------------------------------------------
-
-
-@pytest.fixture()
-def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
-    """Yield a RemoteDocumentParser and clean up its temporary directory afterwards.
-
-    Yields
-    ------
-    RemoteDocumentParser
-        A ready-to-use parser instance.
-    """
-    with RemoteDocumentParser() as parser:
-        yield parser
-
-
-# ------------------------------------------------------------------
-# Remote parser settings helpers
-# ------------------------------------------------------------------
-
-
-@pytest.fixture()
-def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
-    """Configure Django settings for a valid Azure AI OCR engine.
-
-    Sets ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``, and
-    ``REMOTE_OCR_ENDPOINT`` to test values.  Settings are restored
-    automatically after the test by pytest-django.
-
-    Returns
-    -------
-    SettingsWrapper
-        The modified settings object (for chaining further overrides).
-    """
-    settings.REMOTE_OCR_ENGINE = "azureai"
-    settings.REMOTE_OCR_API_KEY = "test-api-key"
-    settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
-    return settings
-
-
-@pytest.fixture()
-def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
-    """Configure Django settings with no remote engine configured.
-
-    Returns
-    -------
-    SettingsWrapper
-        The modified settings object.
-    """
-    settings.REMOTE_OCR_ENGINE = None
-    settings.REMOTE_OCR_API_KEY = None
-    settings.REMOTE_OCR_ENDPOINT = None
-    return settings
-
-
 # ------------------------------------------------------------------
 # Tika parser sample files
 # ------------------------------------------------------------------
@@ -257,544 +158,3 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    with TikaDocumentParser() as parser:
        yield parser
-
-
-# ------------------------------------------------------------------
-# Mail parser sample files
-# ------------------------------------------------------------------
-
-
-@pytest.fixture(scope="session")
-def mail_samples_dir(samples_dir: Path) -> Path:
-    """Absolute path to the mail parser sample files directory.
-
-    Returns
-    -------
-    Path
-        ``<samples_dir>/mail/``
-    """
-    return samples_dir / "mail"
-
-
-@pytest.fixture(scope="session")
-def broken_email_file(mail_samples_dir: Path) -> Path:
-    """Path to a broken/malformed EML sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/broken.eml``.
-    """
-    return mail_samples_dir / "broken.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_file(mail_samples_dir: Path) -> Path:
-    """Path to a plain-text email sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/simple_text.eml``.
-    """
-    return mail_samples_dir / "simple_text.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected PDF rendition of the plain-text email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/simple_text.eml.pdf``.
-    """
-    return mail_samples_dir / "simple_text.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected thumbnail for the plain-text email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/simple_text.eml.pdf.webp``.
-    """
-    return mail_samples_dir / "simple_text.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_file(mail_samples_dir: Path) -> Path:
-    """Path to an HTML email sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml``.
-    """
-    return mail_samples_dir / "html.eml"
-
-
-@pytest.fixture(scope="session")
-def html_email_pdf_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected PDF rendition of the HTML email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml.pdf``.
-    """
-    return mail_samples_dir / "html.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
-    """Path to the expected thumbnail for the HTML email.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml.pdf.webp``.
-    """
-    return mail_samples_dir / "html.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_html_file(mail_samples_dir: Path) -> Path:
-    """Path to the HTML body of the HTML email sample.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/html.eml.html``.
-    """
-    return mail_samples_dir / "html.eml.html"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_first(mail_samples_dir: Path) -> Path:
-    """Path to the first PDF used in PDF-merge tests.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/first.pdf``.
-    """
-    return mail_samples_dir / "first.pdf"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_second(mail_samples_dir: Path) -> Path:
-    """Path to the second PDF used in PDF-merge tests.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``mail/second.pdf``.
-    """
-    return mail_samples_dir / "second.pdf"
-
-
-# ------------------------------------------------------------------
-# Mail parser instance
-# ------------------------------------------------------------------
-
-
-@pytest.fixture()
-def mail_parser() -> Generator[MailDocumentParser, None, None]:
-    """Yield a MailDocumentParser and clean up its temporary directory afterwards.
-
-    Yields
-    ------
-    MailDocumentParser
-        A ready-to-use parser instance.
-    """
-    with MailDocumentParser() as parser:
-        yield parser
-
-
-@pytest.fixture(scope="session")
-def nginx_base_url() -> Generator[str, None, None]:
-    """
-    The base URL for the nginx HTTP server we expect to be alive
-    """
-    yield "http://localhost:8080"
-
-
-# ------------------------------------------------------------------
-# Tesseract parser sample files
-# ------------------------------------------------------------------
-
-
-@pytest.fixture(scope="session")
-def tesseract_samples_dir(samples_dir: Path) -> Path:
-    """Absolute path to the tesseract parser sample files directory.
-
-    Returns
-    -------
-    Path
-        ``<samples_dir>/tesseract/``
-    """
-    return samples_dir / "tesseract"
-
-
-@pytest.fixture(scope="session")
-def document_webp_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a WebP document sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/document.webp``.
-    """
-    return tesseract_samples_dir / "document.webp"
-
-
-@pytest.fixture(scope="session")
-def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to an encrypted PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/encrypted.pdf``.
-    """
-    return tesseract_samples_dir / "encrypted.pdf"
-
-
-@pytest.fixture(scope="session")
-def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a multi-page digital PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/multi-page-digital.pdf``.
-    """
-    return tesseract_samples_dir / "multi-page-digital.pdf"
-
-
-@pytest.fixture(scope="session")
-def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a multi-page TIFF with alpha channel in RGB.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/multi-page-images-alpha-rgb.tiff``.
-    """
-    return tesseract_samples_dir / "multi-page-images-alpha-rgb.tiff"
-
-
-@pytest.fixture(scope="session")
-def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a multi-page TIFF with alpha channel.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/multi-page-images-alpha.tiff``.
-    """
-    return tesseract_samples_dir / "multi-page-images-alpha.tiff"
-
-
-@pytest.fixture(scope="session")
-def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a multi-page PDF with images.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/multi-page-images.pdf``.
-    """
-    return tesseract_samples_dir / "multi-page-images.pdf"
-
-
-@pytest.fixture(scope="session")
-def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a multi-page TIFF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/multi-page-images.tiff``.
-    """
-    return tesseract_samples_dir / "multi-page-images.tiff"
-
-
-@pytest.fixture(scope="session")
-def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a multi-page mixed PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/multi-page-mixed.pdf``.
-    """
-    return tesseract_samples_dir / "multi-page-mixed.pdf"
-
-
-@pytest.fixture(scope="session")
-def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a PNG with alpha channel and no text.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/no-text-alpha.png``.
-    """
-    return tesseract_samples_dir / "no-text-alpha.png"
-
-
-@pytest.fixture(scope="session")
-def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a rotated PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/rotated.pdf``.
-    """
-    return tesseract_samples_dir / "rotated.pdf"
-
-
-@pytest.fixture(scope="session")
-def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to an RTL test PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/rtl-test.pdf``.
-    """
-    return tesseract_samples_dir / "rtl-test.pdf"
-
-
-@pytest.fixture(scope="session")
-def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a signed PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/signed.pdf``.
-    """
-    return tesseract_samples_dir / "signed.pdf"
-
-
-@pytest.fixture(scope="session")
-def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple PNG with alpha channel.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple-alpha.png``.
-    """
-    return tesseract_samples_dir / "simple-alpha.png"
-
-
-@pytest.fixture(scope="session")
-def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple digital PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple-digital.pdf``.
-    """
-    return tesseract_samples_dir / "simple-digital.pdf"
-
-
-@pytest.fixture(scope="session")
-def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple PNG without DPI information.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple-no-dpi.png``.
-    """
-    return tesseract_samples_dir / "simple-no-dpi.png"
-
-
-@pytest.fixture(scope="session")
-def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple BMP sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple.bmp``.
-    """
-    return tesseract_samples_dir / "simple.bmp"
-
-
-@pytest.fixture(scope="session")
-def simple_gif_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple GIF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple.gif``.
-    """
-    return tesseract_samples_dir / "simple.gif"
-
-
-@pytest.fixture(scope="session")
-def simple_heic_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple HEIC sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple.heic``.
-    """
-    return tesseract_samples_dir / "simple.heic"
-
-
-@pytest.fixture(scope="session")
-def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple JPG sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple.jpg``.
-    """
-    return tesseract_samples_dir / "simple.jpg"
-
-
-@pytest.fixture(scope="session")
-def simple_png_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple PNG sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple.png``.
-    """
-    return tesseract_samples_dir / "simple.png"
-
-
-@pytest.fixture(scope="session")
-def simple_tif_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a simple TIF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/simple.tif``.
-    """
-    return tesseract_samples_dir / "simple.tif"
-
-
-@pytest.fixture(scope="session")
-def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a single-page mixed PDF sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/single-page-mixed.pdf``.
-    """
-    return tesseract_samples_dir / "single-page-mixed.pdf"
-
-
-@pytest.fixture(scope="session")
-def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
-    """Path to a PDF with form sample file.
-
-    Returns
-    -------
-    Path
-        Absolute path to ``tesseract/with-form.pdf``.
-    """
-    return tesseract_samples_dir / "with-form.pdf"
-
-
-# ------------------------------------------------------------------
-# Tesseract parser instance and settings helpers
-# ------------------------------------------------------------------
-
-
-@pytest.fixture()
-def null_app_config(mocker: MockerFixture) -> MagicMock:
-    """Return a MagicMock with all OcrConfig fields set to None.
-
-    This allows the parser to fall back to Django settings instead of
-    hitting the database.
-
-    Returns
-    -------
-    MagicMock
-        Mock config with all fields as None
-    """
-    return mocker.MagicMock(
-        output_type=None,
-        pages=None,
-        language=None,
-        mode=None,
-        skip_archive_file=None,
-        image_dpi=None,
-        unpaper_clean=None,
-        deskew=None,
-        rotate_pages=None,
-        rotate_pages_threshold=None,
-        max_image_pixels=None,
-        color_conversion_strategy=None,
-        user_args=None,
-    )
-
-
-@pytest.fixture()
-def tesseract_parser(
-    mocker: MockerFixture,
-    null_app_config: MagicMock,
-) -> Generator[RasterisedDocumentParser, None, None]:
-    """Yield a RasterisedDocumentParser and clean up its temporary directory afterwards.
-
-    Patches the config system to avoid database access.
-
-    Yields
-    ------
-    RasterisedDocumentParser
-        A ready-to-use parser instance.
-    """
-    mocker.patch(
-        "paperless.config.BaseConfig._get_config_instance",
-        return_value=null_app_config,
-    )
-    with RasterisedDocumentParser() as parser:
-        yield parser
-
-
-@pytest.fixture()
-def make_tesseract_parser(
-    mocker: MockerFixture,
-    null_app_config: MagicMock,
-) -> MakeTesseractParser:
-    """Return a factory for creating RasterisedDocumentParser with Django settings overrides.
-
-    This fixture is useful for tests that need to create parsers with different
-    settings configurations.
-
-    Returns
-    -------
-    Callable[..., contextmanager[RasterisedDocumentParser]]
-        A context manager factory that accepts Django settings overrides
-    """
-    mocker.patch(
-        "paperless.config.BaseConfig._get_config_instance",
-        return_value=null_app_config,
-    )
-
-    @contextmanager
-    def _make_parser(**django_settings_overrides):
-        with override_settings(**django_settings_overrides):
-            with RasterisedDocumentParser() as parser:
-                yield parser
-
-    return _make_parser
--- a/src/paperless/tests/parsers/test_remote_parser.py
+++ b/src/paperless/tests/parsers/test_remote_parser.py
@@ -1,497 +0,0 @@
-"""
-Tests for paperless.parsers.remote.RemoteDocumentParser.
-
-All tests use the context-manager protocol for parser lifecycle.
-
-Fixture layout
--------------
-make_azure_mock  — factory (defined here; specific to this module)
-azure_client     — composes azure_settings + make_azure_mock + patch;
-                   use when a test needs the client to succeed
-failing_azure_client
-                 — composes azure_settings + patch with RuntimeError;
-                   use when a test needs the client to fail
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-from unittest.mock import Mock
-
-import pytest
-
-from paperless.parsers import ParserContext
-from paperless.parsers import ParserProtocol
-from paperless.parsers.remote import RemoteDocumentParser
-
-if TYPE_CHECKING:
-    from collections.abc import Callable
-    from pathlib import Path
-
-    from pytest_django.fixtures import SettingsWrapper
-    from pytest_mock import MockerFixture
-
-
-# ---------------------------------------------------------------------------
-# Module-local fixtures
-# ---------------------------------------------------------------------------
-
-_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
-_DEFAULT_TEXT = "Extracted text."
-
-
-@pytest.fixture()
-def make_azure_mock() -> Callable[[str], Mock]:
-    """Return a factory that builds a mock Azure DocumentIntelligenceClient.
-
-    Usage::
-
-        mock_client = make_azure_mock()            # default extracted text
-        mock_client = make_azure_mock("My text.")  # custom extracted text
-    """
-
-    def _factory(text: str = _DEFAULT_TEXT) -> Mock:
-        mock_client = Mock()
-        mock_poller = Mock()
-        mock_poller.wait.return_value = None
-        mock_poller.details = {"operation_id": "fake-op-id"}
-        mock_poller.result.return_value.content = text
-        mock_client.begin_analyze_document.return_value = mock_poller
-        mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
-        return mock_client
-
-    return _factory
-
-
-@pytest.fixture()
-def azure_client(
-    azure_settings: SettingsWrapper,
-    make_azure_mock: Callable[[str], Mock],
-    mocker: MockerFixture,
-) -> Mock:
-    """Patch the Azure DI client with a succeeding mock and return the instance.
-
-    Implicitly applies ``azure_settings`` so tests using this fixture do not
-    also need ``@pytest.mark.usefixtures("azure_settings")``.
-    """
-    mock_client = make_azure_mock()
-    mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
-    return mock_client
-
-
-@pytest.fixture()
-def failing_azure_client(
-    azure_settings: SettingsWrapper,
-    mocker: MockerFixture,
-) -> Mock:
-    """Patch the Azure DI client to raise RuntimeError on every call.
-
-    Implicitly applies ``azure_settings``.  Returns the mock instance so
-    tests can assert on calls such as ``close()``.
-    """
-    mock_client = Mock()
-    mock_client.begin_analyze_document.side_effect = RuntimeError("network failure")
-    mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
-    return mock_client
-
-
-# ---------------------------------------------------------------------------
-# Protocol contract
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserProtocol:
-    """Verify that RemoteDocumentParser satisfies the ParserProtocol contract."""
-
-    def test_isinstance_satisfies_protocol(
-        self,
-        remote_parser: RemoteDocumentParser,
-    ) -> None:
-        assert isinstance(remote_parser, ParserProtocol)
-
-    def test_class_attributes_present(self) -> None:
-        assert isinstance(RemoteDocumentParser.name, str) and RemoteDocumentParser.name
-        assert (
-            isinstance(RemoteDocumentParser.version, str)
-            and RemoteDocumentParser.version
-        )
-        assert (
-            isinstance(RemoteDocumentParser.author, str) and RemoteDocumentParser.author
-        )
-        assert isinstance(RemoteDocumentParser.url, str) and RemoteDocumentParser.url
-
-
-# ---------------------------------------------------------------------------
-# supported_mime_types
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserSupportedMimeTypes:
-    """supported_mime_types() always returns the full set regardless of config."""
-
-    def test_returns_dict(self) -> None:
-        mime_types = RemoteDocumentParser.supported_mime_types()
-        assert isinstance(mime_types, dict)
-
-    def test_includes_all_expected_types(self) -> None:
-        mime_types = RemoteDocumentParser.supported_mime_types()
-        expected = {
-            "application/pdf",
-            "image/png",
-            "image/jpeg",
-            "image/tiff",
-            "image/bmp",
-            "image/gif",
-            "image/webp",
-        }
-        assert expected == set(mime_types.keys())
-
-    @pytest.mark.usefixtures("no_engine_settings")
-    def test_returns_full_set_when_not_configured(self) -> None:
-        """
-        GIVEN: No remote engine is configured
-        WHEN:  supported_mime_types() is called
-        THEN:  The full MIME type dict is still returned (score() handles activation)
-        """
-        mime_types = RemoteDocumentParser.supported_mime_types()
-        assert len(mime_types) == 7
-
-
-# ---------------------------------------------------------------------------
-# score()
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserScore:
-    """score() encodes the activation logic: None when unconfigured, 20 when active."""
-
-    @pytest.mark.usefixtures("azure_settings")
-    @pytest.mark.parametrize(
-        "mime_type",
-        [
-            pytest.param("application/pdf", id="pdf"),
-            pytest.param("image/png", id="png"),
-            pytest.param("image/jpeg", id="jpeg"),
-            pytest.param("image/tiff", id="tiff"),
-            pytest.param("image/bmp", id="bmp"),
-            pytest.param("image/gif", id="gif"),
-            pytest.param("image/webp", id="webp"),
-        ],
-    )
-    def test_score_returns_20_when_configured(self, mime_type: str) -> None:
-        result = RemoteDocumentParser.score(mime_type, "doc.pdf")
-        assert result == 20
-
-    @pytest.mark.usefixtures("no_engine_settings")
-    @pytest.mark.parametrize(
-        "mime_type",
-        [
-            pytest.param("application/pdf", id="pdf"),
-            pytest.param("image/png", id="png"),
-            pytest.param("image/jpeg", id="jpeg"),
-        ],
-    )
-    def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
-        result = RemoteDocumentParser.score(mime_type, "doc.pdf")
-        assert result is None
-
-    def test_score_returns_none_when_api_key_missing(
-        self,
-        settings: SettingsWrapper,
-    ) -> None:
-        settings.REMOTE_OCR_ENGINE = "azureai"
-        settings.REMOTE_OCR_API_KEY = None
-        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
-        result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
-        assert result is None
-
-    def test_score_returns_none_when_endpoint_missing(
-        self,
-        settings: SettingsWrapper,
-    ) -> None:
-        settings.REMOTE_OCR_ENGINE = "azureai"
-        settings.REMOTE_OCR_API_KEY = "key"
-        settings.REMOTE_OCR_ENDPOINT = None
-        result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
-        assert result is None
-
-    @pytest.mark.usefixtures("azure_settings")
-    def test_score_returns_none_for_unsupported_mime_type(self) -> None:
-        result = RemoteDocumentParser.score("text/plain", "doc.txt")
-        assert result is None
-
-    @pytest.mark.usefixtures("azure_settings")
-    def test_score_higher_than_tesseract_default(self) -> None:
-        """Remote parser (20) outranks the tesseract default (10) when configured."""
-        score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
-        assert score is not None and score > 10
-
-
-# ---------------------------------------------------------------------------
-# Properties
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserProperties:
-    def test_can_produce_archive_is_true(
-        self,
-        remote_parser: RemoteDocumentParser,
-    ) -> None:
-        assert remote_parser.can_produce_archive is True
-
-    def test_requires_pdf_rendition_is_false(
-        self,
-        remote_parser: RemoteDocumentParser,
-    ) -> None:
-        assert remote_parser.requires_pdf_rendition is False
-
-
-# ---------------------------------------------------------------------------
-# Lifecycle
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserLifecycle:
-    def test_context_manager_cleans_up_tempdir(self) -> None:
-        with RemoteDocumentParser() as parser:
-            tempdir = parser._tempdir
-            assert tempdir.exists()
-        assert not tempdir.exists()
-
-    def test_context_manager_cleans_up_after_exception(self) -> None:
-        tempdir: Path | None = None
-        with pytest.raises(RuntimeError):
-            with RemoteDocumentParser() as parser:
-                tempdir = parser._tempdir
-                raise RuntimeError("boom")
-        assert tempdir is not None
-        assert not tempdir.exists()
-
-
-# ---------------------------------------------------------------------------
-# parse() — happy path
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserParse:
-    def test_parse_returns_text_from_azure(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        azure_client: Mock,
-    ) -> None:
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        assert remote_parser.get_text() == _DEFAULT_TEXT
-
-    def test_parse_sets_archive_path(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        azure_client: Mock,
-    ) -> None:
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        archive = remote_parser.get_archive_path()
-        assert archive is not None
-        assert archive.exists()
-        assert archive.suffix == ".pdf"
-
-    def test_parse_closes_client_on_success(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        azure_client: Mock,
-    ) -> None:
-        remote_parser.configure(ParserContext())
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        azure_client.close.assert_called_once()
-
-    @pytest.mark.usefixtures("no_engine_settings")
-    def test_parse_sets_empty_text_when_not_configured(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        assert remote_parser.get_text() == ""
-        assert remote_parser.get_archive_path() is None
-
-    def test_get_text_none_before_parse(
-        self,
-        remote_parser: RemoteDocumentParser,
-    ) -> None:
-        assert remote_parser.get_text() is None
-
-    def test_get_date_always_none(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        azure_client: Mock,
-    ) -> None:
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        assert remote_parser.get_date() is None
-
-
-# ---------------------------------------------------------------------------
-# parse() — Azure failure path
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserParseError:
-    def test_parse_returns_none_on_azure_error(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        failing_azure_client: Mock,
-    ) -> None:
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        assert remote_parser.get_text() is None
-
-    def test_parse_closes_client_on_error(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        failing_azure_client: Mock,
-    ) -> None:
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        failing_azure_client.close.assert_called_once()
-
-    def test_parse_logs_error_on_azure_failure(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-        failing_azure_client: Mock,
-        mocker: MockerFixture,
-    ) -> None:
-        mock_log = mocker.patch("paperless.parsers.remote.logger")
-
-        remote_parser.parse(sample_pdf_file, "application/pdf")
-
-        mock_log.error.assert_called_once()
-        assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
-
-
-# ---------------------------------------------------------------------------
-# get_page_count()
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserPageCount:
-    def test_page_count_for_pdf(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
-        assert isinstance(count, int)
-        assert count >= 1
-
-    def test_page_count_returns_none_for_image_mime(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        count = remote_parser.get_page_count(sample_pdf_file, "image/png")
-        assert count is None
-
-    def test_page_count_returns_none_for_invalid_pdf(
-        self,
-        remote_parser: RemoteDocumentParser,
-        tmp_path: Path,
-    ) -> None:
-        bad_pdf = tmp_path / "bad.pdf"
-        bad_pdf.write_bytes(b"not a pdf at all")
-        count = remote_parser.get_page_count(bad_pdf, "application/pdf")
-        assert count is None
-
-
-# ---------------------------------------------------------------------------
-# extract_metadata()
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserMetadata:
-    def test_extract_metadata_non_pdf_returns_empty(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
-        assert result == []
-
-    def test_extract_metadata_pdf_returns_list(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
-        assert isinstance(result, list)
-
-    def test_extract_metadata_pdf_entries_have_required_keys(
-        self,
-        remote_parser: RemoteDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
-        for entry in result:
-            assert "namespace" in entry
-            assert "prefix" in entry
-            assert "key" in entry
-            assert "value" in entry
-            assert isinstance(entry["value"], str)
-
-    def test_extract_metadata_does_not_raise_on_invalid_pdf(
-        self,
-        remote_parser: RemoteDocumentParser,
-        tmp_path: Path,
-    ) -> None:
-        bad_pdf = tmp_path / "bad.pdf"
-        bad_pdf.write_bytes(b"not a pdf at all")
-        result = remote_parser.extract_metadata(bad_pdf, "application/pdf")
-        assert result == []
-
-
-# ---------------------------------------------------------------------------
-# Registry integration
-# ---------------------------------------------------------------------------
-
-
-class TestRemoteParserRegistry:
-    def test_registered_in_defaults(self) -> None:
-        from paperless.parsers.registry import ParserRegistry
-
-        registry = ParserRegistry()
-        registry.register_defaults()
-
-        assert RemoteDocumentParser in registry._builtins
-
-    @pytest.mark.usefixtures("azure_settings")
-    def test_get_parser_returns_remote_when_configured(self) -> None:
-        from paperless.parsers.registry import get_parser_registry
-
-        registry = get_parser_registry()
-        parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
-
-        assert parser_cls is RemoteDocumentParser
-
-    @pytest.mark.usefixtures("no_engine_settings")
-    def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
-        self,
-    ) -> None:
-        """With remote off and a truly unsupported MIME type, registry returns None."""
-        from paperless.parsers.registry import ParserRegistry
-
-        registry = ParserRegistry()
-        registry.register_defaults()
-        parser_cls = registry.get_parser_for_file(
-            "application/x-unknown-format",
-            "doc.xyz",
-        )
-
-        assert parser_cls is None
--- a/src/paperless/tests/parsers/test_tesseract_parser.py
+++ b/src/paperless/tests/parsers/test_tesseract_parser.py
--- a/src/paperless/tests/parsers/test_text_parser.py
+++ b/src/paperless/tests/parsers/test_text_parser.py
@@ -12,7 +12,6 @@ from pathlib import Path

 import pytest

-from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.text import TextDocumentParser

@@ -94,7 +93,6 @@ class TestTextParserParse:
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
-        text_parser.configure(ParserContext())
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
@@ -104,7 +102,6 @@ class TestTextParserParse:
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
-        text_parser.configure(ParserContext())
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_archive_path() is None
@@ -114,7 +111,6 @@ class TestTextParserParse:
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
-        text_parser.configure(ParserContext())
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_date() is None
@@ -133,7 +129,6 @@ class TestTextParserParse:
            - Parsing succeeds
            - Invalid bytes are replaced with the Unicode replacement character
        """
-        text_parser.configure(ParserContext())
        text_parser.parse(malformed_txt_file, "text/plain")

        assert text_parser.get_text() == "Pantothens\ufffdure\n"
@@ -256,9 +251,6 @@ class TestTextParserRegistry:
        from paperless.parsers.registry import get_parser_registry

        registry = get_parser_registry()
-        parser_cls = registry.get_parser_for_file(
-            "application/x-unknown-format",
-            "doc.xyz",
-        )
+        parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")

        assert parser_cls is None
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -9,7 +9,6 @@ from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock

 from documents.parsers import ParseError
-from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.tika import TikaDocumentParser

@@ -61,29 +60,6 @@ class TestTikaParserRegistryInterface:
    def test_requires_pdf_rendition_is_true(self) -> None:
        assert TikaDocumentParser().requires_pdf_rendition is True

-    def test_get_page_count_returns_none_without_archive(
-        self,
-        tika_parser: TikaDocumentParser,
-        sample_odt_file: Path,
-    ) -> None:
-        assert (
-            tika_parser.get_page_count(
-                sample_odt_file,
-                "application/vnd.oasis.opendocument.text",
-            )
-            is None
-        )
-
-    def test_get_page_count_returns_int_with_pdf_archive(
-        self,
-        tika_parser: TikaDocumentParser,
-        sample_pdf_file: Path,
-    ) -> None:
-        tika_parser._archive_path = sample_pdf_file
-        count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
-        assert isinstance(count, int)
-        assert count > 0
-

 @pytest.mark.django_db()
 class TestTikaParser:
@@ -107,7 +83,6 @@ class TestTikaParser:
        # Pretend convert to PDF response
        httpx_mock.add_response(content=b"PDF document")

-        tika_parser.configure(ParserContext())
        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")

        assert tika_parser.get_text() == "the content"
--- a/src/paperless/tests/test_registry.py
+++ b/src/paperless/tests/test_registry.py
@@ -18,7 +18,6 @@ from unittest.mock import patch

 import pytest

-from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.registry import ParserRegistry
 from paperless.parsers.registry import get_parser_registry
@@ -104,11 +103,6 @@ def dummy_parser_cls() -> type:
        ) -> list:
            return []

-        def configure(self, context: ParserContext) -> None:
-            """
-            Required to exist, but doesn't need to do anything
-            """
-
        def __enter__(self) -> Self:
            return self

@@ -150,7 +144,6 @@ class TestParserProtocol:
    @pytest.mark.parametrize(
        "missing_method",
        [
-            pytest.param("configure", id="missing-configure"),
            pytest.param("parse", id="missing-parse"),
            pytest.param("get_text", id="missing-get_text"),
            pytest.param("get_thumbnail", id="missing-get_thumbnail"),
--- a/src/paperless_ai/tests/conftest.py
+++ b/src/paperless_ai/tests/conftest.py
@@ -1,10 +0,0 @@
-from pathlib import Path
-
-import pytest
-from pytest_django.fixtures import SettingsWrapper
-
-
-@pytest.fixture
-def temp_llm_index_dir(tmp_path: Path, settings: SettingsWrapper):
-    settings.LLM_INDEX_DIR = tmp_path
-    return tmp_path
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -13,6 +13,14 @@ from documents.models import PaperlessTask
 from paperless_ai import indexing


+@pytest.fixture
+def temp_llm_index_dir(tmp_path):
+    original_dir = indexing.settings.LLM_INDEX_DIR
+    indexing.settings.LLM_INDEX_DIR = tmp_path
+    yield tmp_path
+    indexing.settings.LLM_INDEX_DIR = original_dir
+
+
 @pytest.fixture
 def real_document(db):
    return Document.objects.create(
--- a/src/paperless_ai/tests/test_embedding.py
+++ b/src/paperless_ai/tests/test_embedding.py
@@ -3,6 +3,7 @@ from unittest.mock import MagicMock
 from unittest.mock import patch

 import pytest
+from django.conf import settings

 from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
@@ -18,6 +19,14 @@ def mock_ai_config():
        yield MockAIConfig


+@pytest.fixture
+def temp_llm_index_dir(tmp_path):
+    original_dir = settings.LLM_INDEX_DIR
+    settings.LLM_INDEX_DIR = tmp_path
+    yield tmp_path
+    settings.LLM_INDEX_DIR = original_dir
+
+
 @pytest.fixture
 def mock_document():
    doc = MagicMock(spec=Document)
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -0,0 +1,481 @@
+import re
+from html import escape
+from pathlib import Path
+
+from bleach import clean
+from bleach import linkify
+from django.conf import settings
+from django.utils import timezone
+from django.utils.timezone import is_naive
+from django.utils.timezone import make_aware
+from gotenberg_client import GotenbergClient
+from gotenberg_client.constants import A4
+from gotenberg_client.options import Measurement
+from gotenberg_client.options import MeasurementUnitType
+from gotenberg_client.options import PageMarginsType
+from gotenberg_client.options import PdfAFormat
+from humanize import naturalsize
+from imap_tools import MailAttachment
+from imap_tools import MailMessage
+from tika_client import TikaClient
+
+from documents.parsers import DocumentParser
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless.models import OutputTypeChoices
+from paperless_mail.models import MailRule
+
+
+class MailDocumentParser(DocumentParser):
+    """
+    This parser uses imap_tools to parse .eml files, generates pdf using
+    Gotenberg and sends the html part to a Tika server for text extraction.
+    """
+
+    logging_name = "paperless.parsing.mail"
+
+    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
+        """
+        Maps settings.OCR_OUTPUT_TYPE onto the PdfAFormat to request from
+        Gotenberg; returns None for any output type not handled below
+        """
+        if settings.OCR_OUTPUT_TYPE in {
+            OutputTypeChoices.PDF_A,
+            OutputTypeChoices.PDF_A2,
+        }:
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
+            self.log.warning(
+                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+            )
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
+            return PdfAFormat.A3b
+        return None
+
+    def get_thumbnail(
+        self,
+        document_path: Path,
+        mime_type: str,
+        file_name=None,
+    ) -> Path:
+        if not self.archive_path:
+            self.archive_path = self.generate_pdf(
+                self.parse_file_to_message(document_path),
+            )
+
+        return make_thumbnail_from_pdf(
+            self.archive_path,
+            self.tempdir,
+            self.logging_group,
+        )
+
+    def extract_metadata(self, document_path: Path, mime_type: str):
+        result = []
+
+        try:
+            mail = self.parse_file_to_message(document_path)
+        except ParseError as e:
+            self.log.warning(
+                f"Error while fetching document metadata for {document_path}: {e}",
+            )
+            return result
+
+        for key, value in mail.headers.items():
+            value = ", ".join(i for i in value)
+            try:
+                value.encode("utf-8")
+            except UnicodeEncodeError as e:  # pragma: no cover
+                self.log.debug(f"Skipping header {key}: {e}")
+                continue
+
+            result.append(
+                {
+                    "namespace": "",
+                    "prefix": "header",
+                    "key": key,
+                    "value": value,
+                },
+            )
+
+        result.append(
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": "attachments",
+                "value": ", ".join(
+                    f"{attachment.filename}"
+                    f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
+                    for attachment in mail.attachments
+                ),
+            },
+        )
+
+        result.append(
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": "date",
+                "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
+            },
+        )
+
+        result.sort(key=lambda item: (item["prefix"], item["key"]))
+        return result
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        file_name=None,
+        mailrule_id: int | None = None,
+    ) -> None:
+        """
+        Parses the given .eml into formatted text, based on the decoded email.
+        Sets self.text, self.date (made timezone-aware if naive) and
+        self.archive_path (the generated PDF).  When mailrule_id is given, that
+        rule's pdf_layout selects the PDF layout.  Unreadable files raise ParseError.
+        """
+
+        def strip_text(text: str):
+            """Reduces the spacing of the given text string"""
+            text = re.sub(r"\s+", " ", text)
+            text = re.sub(r"(\n *)+", "\n", text)
+            return text.strip()
+
+        def build_formatted_text(mail_message: MailMessage) -> str:
+            """
+            Constructs a formatted string from the given email, gathering most of
+            its content, including the front matter (headers), into one string
+            """
+            # Read only from the mail_message argument, never the outer `mail`
+            fmt_text = f"Subject: {mail_message.subject}\n\n"
+            fmt_text += f"From: {mail_message.from_values.full}\n\n"
+            to_list = [address.full for address in mail_message.to_values]
+            fmt_text += f"To: {', '.join(to_list)}\n\n"
+            if mail_message.cc_values:
+                cc_list = [address.full for address in mail_message.cc_values]
+                fmt_text += f"CC: {', '.join(cc_list)}\n\n"
+            if mail_message.bcc_values:
+                bcc_list = [address.full for address in mail_message.bcc_values]
+                fmt_text += f"BCC: {', '.join(bcc_list)}\n\n"
+            if mail_message.attachments:
+                att = []
+                for a in mail_message.attachments:
+                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
+                    att.append(
+                        f"{a.filename} ({attachment_size})",
+                    )
+                fmt_text += f"Attachments: {', '.join(att)}\n\n"
+
+            if mail_message.html:
+                html_text = strip_text(self.tika_parse(mail_message.html))
+                fmt_text += f"HTML content: {html_text}"
+
+            fmt_text += f"\n\n{strip_text(mail_message.text)}"
+
+            return fmt_text
+
+        self.log.debug(f"Parsing file {document_path.name} into an email")
+        mail = self.parse_file_to_message(document_path)
+
+        self.log.debug("Building formatted text from email")
+        self.text = build_formatted_text(mail)
+
+        if is_naive(mail.date):
+            self.date = make_aware(mail.date)
+        else:
+            self.date = mail.date
+
+        self.log.debug("Creating a PDF from the email")
+        if mailrule_id:
+            rule = MailRule.objects.get(pk=mailrule_id)
+            self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
+        else:
+            self.archive_path = self.generate_pdf(mail)
+
+    @staticmethod
+    def parse_file_to_message(filepath: Path) -> MailMessage:
+        """
+        Parses the given .eml file into a MailMessage object
+        """
+        try:
+            with filepath.open("rb") as eml:
+                parsed = MailMessage.from_bytes(eml.read())
+                if parsed.from_values is None:
+                    raise ParseError(
+                        f"Could not parse {filepath}: Missing 'from'",
+                    )
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {filepath}: {err}",
+            ) from err
+
+        return parsed
+
+    def tika_parse(self, html: str):
+        self.log.info("Sending content to Tika server")
+
+        try:
+            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
+                parsed = client.tika.as_text.from_buffer(html, "text/html")
+
+                if parsed.content is not None:
+                    return parsed.content.strip()
+                return ""
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse content with tika server at "
+                f"{settings.TIKA_ENDPOINT}: {err}",
+            ) from err
+
+    def generate_pdf(
+        self,
+        mail_message: MailMessage,
+        pdf_layout: MailRule.PdfLayout | None = None,
+    ) -> Path:
+        archive_path = Path(self.tempdir) / "merged.pdf"
+
+        mail_pdf_file = self.generate_pdf_from_mail(mail_message)
+
+        pdf_layout = (
+            pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
+        )  # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
+
+        # If no HTML content, create the PDF from the message
+        # Otherwise, create 2 PDFs and merge them with Gotenberg
+        if not mail_message.html:
+            archive_path.write_bytes(mail_pdf_file.read_bytes())
+        else:
+            pdf_of_html_content = self.generate_pdf_from_html(
+                mail_message.html,
+                mail_message.attachments,
+            )
+
+            self.log.debug("Merging email text and HTML content into single PDF")
+
+            with (
+                GotenbergClient(
+                    host=settings.TIKA_GOTENBERG_ENDPOINT,
+                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+                ) as client,
+                client.merge.merge() as route,
+            ):
+                # Configure requested PDF/A formatting, if any
+                pdf_a_format = self._settings_to_gotenberg_pdfa()
+                if pdf_a_format is not None:
+                    route.pdf_format(pdf_a_format)
+
+                match pdf_layout:
+                    case MailRule.PdfLayout.HTML_TEXT:
+                        route.merge([pdf_of_html_content, mail_pdf_file])
+                    case MailRule.PdfLayout.HTML_ONLY:
+                        route.merge([pdf_of_html_content])
+                    case MailRule.PdfLayout.TEXT_ONLY:
+                        route.merge([mail_pdf_file])
+                    case MailRule.PdfLayout.TEXT_HTML | _:
+                        route.merge([mail_pdf_file, pdf_of_html_content])
+
+                try:
+                    response = route.run()
+                    archive_path.write_bytes(response.content)
+                except Exception as err:
+                    raise ParseError(
+                        f"Error while merging email HTML into PDF: {err}",
+                    ) from err
+
+        return archive_path
+
+    def mail_to_html(self, mail: MailMessage) -> Path:
+        """
+        Renders the email's headers and text body with email_msg_template.html
+        and writes the result to email_as_html.html under self.tempdir
+        """
+
+        def clean_html(text: str) -> str:
+            """
+            Attempts to clean, escape and linkify the given HTML string
+            """
+            if isinstance(text, list):
+                text = "\n".join([str(e) for e in text])
+            if not isinstance(text, str):
+                text = str(text)
+            text = escape(text)
+            text = clean(text)
+            text = linkify(text, parse_email=True)
+            text = text.replace("\n", "<br>")
+            return text
+
+        data = {}
+
+        data["subject"] = clean_html(mail.subject)
+        if data["subject"]:
+            data["subject_label"] = "Subject"
+        data["from"] = clean_html(mail.from_values.full)
+        if data["from"]:
+            data["from_label"] = "From"
+        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        if data["to"]:
+            data["to_label"] = "To"
+        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        if data["cc"]:
+            data["cc_label"] = "CC"
+        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        if data["bcc"]:
+            data["bcc_label"] = "BCC"
+
+        att = []
+        for a in mail.attachments:
+            att.append(
+                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
+            )
+        data["attachments"] = clean_html(", ".join(att))
+        if data["attachments"]:
+            data["attachments_label"] = "Attachments"
+
+        data["date"] = clean_html(
+            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
+        )
+        data["content"] = clean_html(mail.text.strip())
+
+        from django.template.loader import render_to_string
+
+        html_file = Path(self.tempdir) / "email_as_html.html"
+        html_file.write_text(render_to_string("email_msg_template.html", context=data))
+
+        return html_file
+
+    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
+        """
+        Creates a PDF based on the given email, using the email's values in
+        an HTML template
+        """
+        self.log.info("Converting mail to PDF")
+
+        css_file = Path(__file__).parent / "templates" / "output.css"
+        email_html_file = self.mail_to_html(mail)
+
+        with (
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ) as client,
+            client.chromium.html_to_pdf() as route,
+        ):
+            # Configure requested PDF/A formatting, if any
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
+            if pdf_a_format is not None:
+                route.pdf_format(pdf_a_format)
+
+            try:
+                response = (
+                    route.index(email_html_file)
+                    .resource(css_file)
+                    .margins(
+                        PageMarginsType(
+                            top=Measurement(0.1, MeasurementUnitType.Inches),
+                            bottom=Measurement(0.1, MeasurementUnitType.Inches),
+                            left=Measurement(0.1, MeasurementUnitType.Inches),
+                            right=Measurement(0.1, MeasurementUnitType.Inches),
+                        ),
+                    )
+                    .size(A4)
+                    .scale(1.0)
+                    .run()
+                )
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting email to PDF: {err}",
+                ) from err
+
+        email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
+        email_as_pdf_file.write_bytes(response.content)
+
+        return email_as_pdf_file
+
+    def generate_pdf_from_html(
+        self,
+        orig_html: str,
+        attachments: list[MailAttachment],
+    ) -> Path:
+        """
+        Generates a PDF file based on the HTML and attachments of the email
+        """
+
+        def clean_html_script(text: str):
+            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
+            text = compiled_open.sub("<div hidden ", text)
+
+            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
+            text = compiled_close.sub("</div", text)
+            return text
+
+        self.log.info("Converting message html to PDF")
+
+        tempdir = Path(self.tempdir)
+
+        html_clean = clean_html_script(orig_html)
+        html_clean_file = tempdir / "index.html"
+        html_clean_file.write_text(html_clean)
+
+        with (
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ) as client,
+            client.chromium.html_to_pdf() as route,
+        ):
+            # Configure requested PDF/A formatting, if any
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
+            if pdf_a_format is not None:
+                route.pdf_format(pdf_a_format)
+
+            # Add attachments as resources, cleaning the filename and replacing
+            # it in the index file for inclusion
+            for attachment in attachments:
+                # Clean the attachment name to be valid
+                name_cid = f"cid:{attachment.content_id}"
+                name_clean = "".join(e for e in name_cid if e.isalnum())
+
+                # Write attachment payload to a temp file
+                temp_file = tempdir / name_clean
+                temp_file.write_bytes(attachment.payload)
+
+                route.resource(temp_file)
+
+                # Replace as needed the name with the clean name
+                html_clean = html_clean.replace(name_cid, name_clean)
+
+            # Now store the cleaned up HTML version
+            html_clean_file = tempdir / "index.html"
+            html_clean_file.write_text(html_clean)
+            # This is our index file, the main page basically
+            route.index(html_clean_file)
+
+            # Set page size, margins
+            route.margins(
+                PageMarginsType(
+                    top=Measurement(0.1, MeasurementUnitType.Inches),
+                    bottom=Measurement(0.1, MeasurementUnitType.Inches),
+                    left=Measurement(0.1, MeasurementUnitType.Inches),
+                    right=Measurement(0.1, MeasurementUnitType.Inches),
+                ),
+            ).size(A4).scale(1.0)
+
+            try:
+                response = route.run()
+
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting document to PDF: {err}",
+                ) from err
+
+        html_pdf = tempdir / "html.pdf"
+        html_pdf.write_bytes(response.content)
+        return html_pdf
+
+    def get_settings(self) -> None:
+        """
+        This parser does not implement additional settings yet
+        """
+        return None
--- a/src/paperless_mail/signals.py
+++ b/src/paperless_mail/signals.py
@@ -1,12 +1,7 @@
 def get_parser(*args, **kwargs):
-    from paperless.parsers.mail import MailDocumentParser
+    from paperless_mail.parsers import MailDocumentParser

-    # MailDocumentParser accepts no constructor args in the new-style protocol.
-    # Pop legacy args that arrive from the signal-based consumer path.
-    # Phase 4 will replace this signal path with the ParserRegistry.
-    kwargs.pop("logging_group", None)
-    kwargs.pop("progress_callback", None)
-    return MailDocumentParser()
+    return MailDocumentParser(*args, **kwargs)


 def mail_consumer_declaration(sender, **kwargs):
--- a/src/paperless_mail/tests/conftest.py
+++ b/src/paperless_mail/tests/conftest.py
@@ -1,9 +1,71 @@
 from collections.abc import Generator
+from pathlib import Path

 import pytest

 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.models import MailAccount
+from paperless_mail.parsers import MailDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def broken_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "broken.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml"
+
+
+@pytest.fixture(scope="session")
+def html_email_pdf_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def html_email_thumbnail_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_html_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.html"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_first(sample_dir: Path) -> Path:
+    return sample_dir / "first.pdf"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_second(sample_dir: Path) -> Path:
+    return sample_dir / "second.pdf"
+
+
+@pytest.fixture()
+def mail_parser() -> MailDocumentParser:
+    return MailDocumentParser(logging_group=None)


@pytest.fixture()
@@ -27,3 +89,11 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
@pytest.fixture()
 def mail_account_handler() -> MailAccountHandler:
    return MailAccountHandler()
+
+
+@pytest.fixture(scope="session")
+def nginx_base_url() -> Generator[str, None, None]:
+    """
+    The base URL for the nginx HTTP server we expect to be alive
+    """
+    yield "http://localhost:8080"
--- a/src/paperless_mail/tests/samples/broken.eml
+++ b/src/paperless_mail/tests/samples/broken.eml
--- a/src/paperless_mail/tests/samples/first.pdf
+++ b/src/paperless_mail/tests/samples/first.pdf
--- a/src/paperless_mail/tests/samples/html.eml
+++ b/src/paperless_mail/tests/samples/html.eml
--- a/src/paperless_mail/tests/samples/html.eml.html
+++ b/src/paperless_mail/tests/samples/html.eml.html
--- a/src/paperless_mail/tests/samples/html.eml.pdf
+++ b/src/paperless_mail/tests/samples/html.eml.pdf
--- a/src/paperless_mail/tests/samples/html.eml.pdf.webp
+++ b/src/paperless_mail/tests/samples/html.eml.pdf.webp
--- a/src/paperless_mail/tests/samples/sample.html
+++ b/src/paperless_mail/tests/samples/sample.html
--- a/src/paperless_mail/tests/samples/sample.html.pdf
+++ b/src/paperless_mail/tests/samples/sample.html.pdf
--- a/src/paperless_mail/tests/samples/sample.html.pdf.webp
+++ b/src/paperless_mail/tests/samples/sample.html.pdf.webp
--- a/src/paperless_mail/tests/samples/sample.png
+++ b/src/paperless_mail/tests/samples/sample.png
--- a/src/paperless_mail/tests/samples/second.pdf
+++ b/src/paperless_mail/tests/samples/second.pdf
--- a/src/paperless_mail/tests/samples/simple_text.eml
+++ b/src/paperless_mail/tests/samples/simple_text.eml
--- a/src/paperless_mail/tests/samples/simple_text.eml.pdf
+++ b/src/paperless_mail/tests/samples/simple_text.eml.pdf
--- a/src/paperless_mail/tests/samples/simple_text.eml.pdf.webp
+++ b/src/paperless_mail/tests/samples/simple_text.eml.pdf.webp
--- a/src/paperless_mail/tests/test_mail_oauth.py
+++ b/src/paperless_mail/tests/test_mail_oauth.py
@@ -1,6 +1,7 @@
 from datetime import timedelta
 from unittest import mock

+from django.conf import settings
 from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
 from django.test import TestCase
@@ -15,13 +16,6 @@ from paperless_mail.models import MailAccount
 from paperless_mail.oauth import PaperlessMailOAuth2Manager


-@override_settings(
-    OAUTH_CALLBACK_BASE_URL="http://localhost:8000",
-    GMAIL_OAUTH_CLIENT_ID="test_gmail_client_id",
-    GMAIL_OAUTH_CLIENT_SECRET="test_gmail_client_secret",
-    OUTLOOK_OAUTH_CLIENT_ID="test_outlook_client_id",
-    OUTLOOK_OAUTH_CLIENT_SECRET="test_outlook_client_secret",
-)
 class TestMailOAuth(
    TestCase,
 ):
@@ -37,6 +31,12 @@ class TestMailOAuth(
        self.user.save()
        self.client.force_login(self.user)
        self.mail_account_handler = MailAccountHandler()
+        # Mock settings
+        settings.OAUTH_CALLBACK_BASE_URL = "http://localhost:8000"
+        settings.GMAIL_OAUTH_CLIENT_ID = "test_gmail_client_id"
+        settings.GMAIL_OAUTH_CLIENT_SECRET = "test_gmail_client_secret"
+        settings.OUTLOOK_OAUTH_CLIENT_ID = "test_outlook_client_id"
+        settings.OUTLOOK_OAUTH_CLIENT_SECRET = "test_outlook_client_secret"
        super().setUp()

    def test_generate_paths(self) -> None:
--- a/src/paperless/tests/parsers/test_mail_parser.py
+++ b/src/paperless/tests/parsers/test_mail_parser.py
@@ -12,64 +12,7 @@ from pytest_httpx import HTTPXMock
 from pytest_mock import MockerFixture

 from documents.parsers import ParseError
-from paperless.parsers import ParserContext
-from paperless.parsers import ParserProtocol
-from paperless.parsers.mail import MailDocumentParser
-
-
-class TestMailParserProtocol:
-    """Verify that MailDocumentParser satisfies the ParserProtocol contract."""
-
-    def test_isinstance_satisfies_protocol(
-        self,
-        mail_parser: MailDocumentParser,
-    ) -> None:
-        assert isinstance(mail_parser, ParserProtocol)
-
-    def test_supported_mime_types(self) -> None:
-        mime_types = MailDocumentParser.supported_mime_types()
-        assert isinstance(mime_types, dict)
-        assert "message/rfc822" in mime_types
-
-    @pytest.mark.parametrize(
-        ("mime_type", "expected"),
-        [
-            ("message/rfc822", 10),
-            ("application/pdf", None),
-            ("text/plain", None),
-        ],
-    )
-    def test_score(self, mime_type: str, expected: int | None) -> None:
-        assert MailDocumentParser.score(mime_type, "email.eml") == expected
-
-    def test_can_produce_archive_is_false(
-        self,
-        mail_parser: MailDocumentParser,
-    ) -> None:
-        assert mail_parser.can_produce_archive is False
-
-    def test_requires_pdf_rendition_is_true(
-        self,
-        mail_parser: MailDocumentParser,
-    ) -> None:
-        assert mail_parser.requires_pdf_rendition is True
-
-    def test_get_page_count_returns_none_without_archive(
-        self,
-        mail_parser: MailDocumentParser,
-        html_email_file: Path,
-    ) -> None:
-        assert mail_parser.get_page_count(html_email_file, "message/rfc822") is None
-
-    def test_get_page_count_returns_int_with_pdf_archive(
-        self,
-        mail_parser: MailDocumentParser,
-        simple_txt_email_pdf_file: Path,
-    ) -> None:
-        mail_parser._archive_path = simple_txt_email_pdf_file
-        count = mail_parser.get_page_count(simple_txt_email_pdf_file, "message/rfc822")
-        assert isinstance(count, int)
-        assert count > 0
+from paperless_mail.parsers import MailDocumentParser


 class TestEmailFileParsing:
@@ -81,7 +24,7 @@ class TestEmailFileParsing:
    def test_parse_error_missing_file(
        self,
        mail_parser: MailDocumentParser,
-        mail_samples_dir: Path,
+        sample_dir: Path,
    ) -> None:
        """
        GIVEN:
@@ -92,7 +35,7 @@ class TestEmailFileParsing:
            - An Exception is thrown
        """
        # Check if exception is raised when parsing fails.
-        test_file = mail_samples_dir / "doesntexist.eml"
+        test_file = sample_dir / "doesntexist.eml"

        assert not test_file.exists()

@@ -303,12 +246,12 @@ class TestEmailThumbnailGenerate:
        """
        mocked_return = "Passing the return value through.."
        mock_make_thumbnail_from_pdf = mocker.patch(
-            "paperless.parsers.mail.make_thumbnail_from_pdf",
+            "paperless_mail.parsers.make_thumbnail_from_pdf",
        )
        mock_make_thumbnail_from_pdf.return_value = mocked_return

        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = "Mocked return value.."

@@ -317,7 +260,8 @@ class TestEmailThumbnailGenerate:
        mock_generate_pdf.assert_called_once()
        mock_make_thumbnail_from_pdf.assert_called_once_with(
            "Mocked return value..",
-            mail_parser._tempdir,
+            mail_parser.tempdir,
+            None,
        )

        assert mocked_return == thumb
@@ -429,7 +373,7 @@ class TestParser:
        """
        # Validate parsing returns the expected results
        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )

        mail_parser.parse(simple_txt_email_file, "message/rfc822")
@@ -441,7 +385,7 @@ class TestParser:
            "BCC: fdf@fvf.de\n\n"
            "\n\nThis is just a simple Text Mail."
        )
-        assert text_expected == mail_parser.get_text()
+        assert text_expected == mail_parser.text
        assert (
            datetime.datetime(
                2022,
@@ -452,7 +396,7 @@ class TestParser:
                43,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.get_date()
+            == mail_parser.date
        )

        # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
@@ -475,7 +419,7 @@ class TestParser:
        """

        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )

        # Validate parsing returns the expected results
@@ -499,7 +443,7 @@ class TestParser:
        mail_parser.parse(html_email_file, "message/rfc822")

        mock_generate_pdf.assert_called_once()
-        assert text_expected == mail_parser.get_text()
+        assert text_expected == mail_parser.text
        assert (
            datetime.datetime(
                2022,
@@ -510,7 +454,7 @@ class TestParser:
                19,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.get_date()
+            == mail_parser.date
        )

    def test_generate_pdf_parse_error(
@@ -557,7 +501,7 @@ class TestParser:

        mail_parser.parse(simple_txt_email_file, "message/rfc822")

-        assert mail_parser.get_archive_path() is not None
+        assert mail_parser.archive_path is not None

    @pytest.mark.httpx_mock(can_send_already_matched_responses=True)
    def test_generate_pdf_html_email(
@@ -598,7 +542,7 @@ class TestParser:
        )
        mail_parser.parse(html_email_file, "message/rfc822")

-        assert mail_parser.get_archive_path() is not None
+        assert mail_parser.archive_path is not None

    def test_generate_pdf_html_email_html_to_pdf_failure(
        self,
@@ -768,10 +712,10 @@ class TestParser:

        def test_layout_option(layout_option, expected_calls, expected_pdf_names):
            mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
-            mail_parser.configure(ParserContext(mailrule_id=1))
            mail_parser.parse(
                document_path=html_email_file,
                mime_type="message/rfc822",
+                mailrule_id=1,
            )
            args, _ = mock_merge_route.call_args
            assert len(args[0]) == expected_calls
--- a/src/paperless/tests/parsers/test_mail_parser_live.py
+++ b/src/paperless/tests/parsers/test_mail_parser_live.py
@@ -11,7 +11,7 @@ from PIL import Image
 from pytest_mock import MockerFixture

 from documents.tests.utils import util_call_with_backoff
-from paperless.parsers.mail import MailDocumentParser
+from paperless_mail.parsers import MailDocumentParser


 def extract_text(pdf_path: Path) -> str:
@@ -159,7 +159,7 @@ class TestParserLive:
            - The returned thumbnail image file shall match the expected hash
        """
        mock_generate_pdf = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = simple_txt_email_pdf_file

@@ -216,10 +216,10 @@ class TestParserLive:
            - The merged PDF shall contain text from both source PDFs
        """
        mock_generate_pdf_from_html = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
        )
        mock_generate_pdf_from_mail = mocker.patch(
-            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
        )
        mock_generate_pdf_from_mail.return_value = merged_pdf_first
        mock_generate_pdf_from_html.return_value = merged_pdf_second
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -0,0 +1,118 @@
+from pathlib import Path
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:
+    def __init__(
+        self,
+        engine: str,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+    ):
+        self.engine = engine  # engine identifier; only "azureai" can validate
+        self.api_key = api_key  # credential for the remote service, may be None
+        self.endpoint = endpoint  # service URL, required for "azureai"
+
+    def engine_is_valid(self):
+        """Return True only for "azureai" with both api_key and endpoint set."""
+        if self.engine != "azureai":
+            return False
+        return self.api_key is not None and self.endpoint is not None
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+    """
+    This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
+    as this is the only service that provides a remote OCR API with text-embedded PDF output.
+    """
+
+    logging_name = "paperless.parsing.remote"
+
+    def get_settings(self) -> RemoteEngineConfig:
+        """Build the remote OCR engine configuration from the Django settings."""
+        return RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+
+    def supported_mime_types(self):
+        """Return the MIME type to extension map, or {} without a valid engine."""
+        if not self.settings.engine_is_valid():
+            return {}
+        return {
+            "application/pdf": ".pdf",
+            "image/png": ".png",
+            "image/jpeg": ".jpg",
+            "image/tiff": ".tiff",
+            "image/bmp": ".bmp",
+            "image/gif": ".gif",
+            "image/webp": ".webp",
+        }
+
+    def azure_ai_vision_parse(self, file: Path) -> str | None:
+        """
+        Send ``file`` to Azure AI Vision ("prebuilt-read") and request a
+        searchable PDF with embedded text.
+
+        On success the PDF is saved to ``tempdir / "archive.pdf"``, ``archive_path``
+        is set to it and the extracted text content is returned. On any exception
+        the error is logged, ``archive_path`` is left unchanged and None is returned.
+        """
+        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+        from azure.ai.documentintelligence.models import AnalyzeOutputOption
+        from azure.ai.documentintelligence.models import DocumentContentFormat
+        from azure.core.credentials import AzureKeyCredential
+
+        client = DocumentIntelligenceClient(
+            endpoint=self.settings.endpoint,
+            credential=AzureKeyCredential(self.settings.api_key),
+        )
+
+        try:
+            with file.open("rb") as f:
+                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
+                poller = client.begin_analyze_document(
+                    model_id="prebuilt-read",
+                    body=analyze_request,
+                    output_content_format=DocumentContentFormat.TEXT,
+                    output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
+                    content_type="application/json",
+                )
+
+            poller.wait()
+            result_id = poller.details["operation_id"]
+            result = poller.result()
+
+            # Download the PDF with embedded text; archive_path is only set once
+            # the file is complete, so a failed download never exposes a partial PDF.
+            archive_file = self.tempdir / "archive.pdf"
+            with archive_file.open("wb") as f:
+                for chunk in client.get_analyze_result_pdf(
+                    model_id="prebuilt-read",
+                    result_id=result_id,
+                ):
+                    f.write(chunk)
+            self.archive_path = archive_file
+            return result.content
+        except Exception as e:
+            self.log.error(f"Azure AI Vision parsing failed: {e}")
+            return None
+        finally:
+            client.close()
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        """
+        Fill ``self.text``: "" without a valid engine, None if the Azure call fails.
+        """
+        if not self.settings.engine_is_valid():
+            self.log.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self.text = ""
+        elif self.settings.engine == "azureai":
+            self.text = self.azure_ai_vision_parse(document_path)
--- a/src/paperless_remote/signals.py
+++ b/src/paperless_remote/signals.py
@@ -1,36 +1,16 @@
-from __future__ import annotations
+def get_parser(*args, **kwargs):
+    from paperless_remote.parsers import RemoteDocumentParser

-from typing import Any
-
-
-def get_parser(*args: Any, **kwargs: Any) -> Any:
-    from paperless.parsers.remote import RemoteDocumentParser
-
-    # The new RemoteDocumentParser does not accept the progress_callback
-    # kwarg injected by the old signal-based consumer.  logging_group is
-    # forwarded as a positional arg.
-    # Phase 4 will replace this signal path with the new ParserRegistry.
-    kwargs.pop("progress_callback", None)
    return RemoteDocumentParser(*args, **kwargs)


-def get_supported_mime_types() -> dict[str, str]:
-    from django.conf import settings
+def get_supported_mime_types():
+    from paperless_remote.parsers import RemoteDocumentParser

-    from paperless.parsers.remote import RemoteDocumentParser
-    from paperless.parsers.remote import RemoteEngineConfig
-
-    config = RemoteEngineConfig(
-        engine=settings.REMOTE_OCR_ENGINE,
-        api_key=settings.REMOTE_OCR_API_KEY,
-        endpoint=settings.REMOTE_OCR_ENDPOINT,
-    )
-    if not config.engine_is_valid():
-        return {}
-    return RemoteDocumentParser.supported_mime_types()
+    return RemoteDocumentParser(None).supported_mime_types()


-def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
+def remote_consumer_declaration(sender, **kwargs):
    return {
        "parser": get_parser,
        "weight": 5,
--- a/src/paperless_remote/tests/samples/simple-digital.pdf
+++ b/src/paperless_remote/tests/samples/simple-digital.pdf
--- a/src/paperless_remote/tests/test_parser.py
+++ b/src/paperless_remote/tests/test_parser.py
@@ -0,0 +1,131 @@
+import uuid
+from pathlib import Path
+from unittest import mock
+
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_remote.parsers import RemoteDocumentParser
+from paperless_remote.signals import get_parser
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content: str, strings: list[str]) -> None:
+        """Assert each string occurs in content and first hits are in list order."""
+        positions = []
+        for s in strings:
+            offset = content.find(s)
+            if offset == -1:
+                self.fail(f"'{s}' is not in '{content}'")
+            positions.append(offset)
+        self.assertListEqual(positions, sorted(positions))
+
+    @mock.patch("paperless_tesseract.parsers.run_subprocess")
+    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
+    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
+        # Arrange mock Azure client
+        mock_client = mock.Mock()
+        mock_client_cls.return_value = mock_client
+
+        # Simulate poller result and its `.details`
+        mock_poller = mock.Mock()
+        mock_poller.wait.return_value = None
+        mock_poller.details = {"operation_id": "fake-op-id"}
+        mock_client.begin_analyze_document.return_value = mock_poller
+        mock_poller.result.return_value.content = "This is a test document."
+
+        # Return dummy PDF bytes
+        mock_client.get_analyze_result_pdf.return_value = [
+            b"%PDF-",
+            b"1.7 ",
+            b"FAKEPDF",
+        ]
+
+        # Simulate pdftotext by writing dummy text to sidecar file
+        def fake_run(cmd, *args, **kwargs) -> None:
+            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
+                f.write("This is a test document.")
+
+        mock_subprocess.side_effect = fake_run
+
+        with override_settings(
+            REMOTE_OCR_ENGINE="azureai",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
+        ):
+            parser = get_parser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )
+
+    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
+    def test_get_text_with_azure_error_logged_and_returns_none(
+        self,
+        mock_client_cls,
+    ) -> None:
+        mock_client = mock.Mock()
+        mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
+        mock_client_cls.return_value = mock_client
+
+        with override_settings(
+            REMOTE_OCR_ENGINE="azureai",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
+        ):
+            parser = get_parser(uuid.uuid4())
+            with mock.patch.object(parser.log, "error") as mock_log_error:
+                parser.parse(
+                    self.SAMPLE_FILES / "simple-digital.pdf",
+                    "application/pdf",
+                )
+
+        self.assertIsNone(parser.text)
+        mock_client.begin_analyze_document.assert_called_once()
+        mock_client.close.assert_called_once()
+        mock_log_error.assert_called_once()
+        self.assertIn(
+            "Azure AI Vision parsing failed",
+            mock_log_error.call_args[0][0],
+        )
+
+    @override_settings(
+        REMOTE_OCR_ENGINE="azureai",
+        REMOTE_OCR_API_KEY="key",
+        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
+    )
+    def test_supported_mime_types_valid_config(self) -> None:
+        parser = RemoteDocumentParser(uuid.uuid4())
+        expected_types = {
+            "application/pdf": ".pdf",
+            "image/png": ".png",
+            "image/jpeg": ".jpg",
+            "image/tiff": ".tiff",
+            "image/bmp": ".bmp",
+            "image/gif": ".gif",
+            "image/webp": ".webp",
+        }
+        self.assertEqual(parser.supported_mime_types(), expected_types)
+
+    def test_supported_mime_types_invalid_config(self) -> None:
+        parser = get_parser(uuid.uuid4())
+        self.assertEqual(parser.supported_mime_types(), {})
+
+    @override_settings(
+        REMOTE_OCR_ENGINE=None,
+        REMOTE_OCR_API_KEY=None,
+        REMOTE_OCR_ENDPOINT=None,
+    )
+    def test_parse_with_invalid_config(self) -> None:
+        parser = get_parser(uuid.uuid4())
+        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
+        self.assertEqual(parser.text, "")
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,18 +1,13 @@
-from __future__ import annotations
-
-import logging
 import os
 import re
-import shutil
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
-from typing import Any
-from typing import Self

 from django.conf import settings
 from PIL import Image

+from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
@@ -21,28 +16,6 @@ from paperless.config import OcrConfig
 from paperless.models import ArchiveFileChoices
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
-from paperless.parsers.utils import read_file_handle_unicode_errors
-from paperless.version import __full_version_str__
-
-if TYPE_CHECKING:
-    import datetime
-    from types import TracebackType
-
-    from paperless.parsers import MetadataEntry
-    from paperless.parsers import ParserContext
-
-logger = logging.getLogger("paperless.parsing.tesseract")
-
-_SUPPORTED_MIME_TYPES: dict[str, str] = {
-    "application/pdf": ".pdf",
-    "image/jpeg": ".jpg",
-    "image/png": ".png",
-    "image/tiff": ".tif",
-    "image/gif": ".gif",
-    "image/bmp": ".bmp",
-    "image/webp": ".webp",
-    "image/heic": ".heic",
-}


 class NoTextFoundException(Exception):
@@ -53,125 +26,81 @@ class RtlLanguageException(Exception):
    pass


-class RasterisedDocumentParser:
+class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

-    name: str = "Paperless-ngx Tesseract OCR Parser"
-    version: str = __full_version_str__
-    author: str = "Paperless-ngx Contributors"
-    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+    logging_name = "paperless.parsing.tesseract"

-    # ------------------------------------------------------------------
-    # Class methods
-    # ------------------------------------------------------------------
+    def get_settings(self) -> OcrConfig:
+        """
+        This parser uses the OCR configuration settings to parse documents
+        """
+        return OcrConfig()

-    @classmethod
-    def supported_mime_types(cls) -> dict[str, str]:
-        return _SUPPORTED_MIME_TYPES
-
-    @classmethod
-    def score(
-        cls,
-        mime_type: str,
-        filename: str,
-        path: Path | None = None,
-    ) -> int | None:
-        if mime_type in _SUPPORTED_MIME_TYPES:
-            return 10
-        return None
-
-    # ------------------------------------------------------------------
-    # Properties
-    # ------------------------------------------------------------------
-
-    @property
-    def can_produce_archive(self) -> bool:
-        return True
-
-    @property
-    def requires_pdf_rendition(self) -> bool:
-        return False
-
-    # ------------------------------------------------------------------
-    # Lifecycle
-    # ------------------------------------------------------------------
-
-    def __init__(self, logging_group: object = None) -> None:
-        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
-        self.tempdir = Path(
-            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
-        )
-        self.settings = OcrConfig()
-        self.archive_path: Path | None = None
-        self.text: str | None = None
-        self.date: datetime.datetime | None = None
-        self.log = logger
-
-    def __enter__(self) -> Self:
-        return self
-
-    def __exit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: TracebackType | None,
-    ) -> None:
-        logger.debug("Cleaning up temporary directory %s", self.tempdir)
-        shutil.rmtree(self.tempdir, ignore_errors=True)
-
-    # ------------------------------------------------------------------
-    # Core parsing interface
-    # ------------------------------------------------------------------
-
-    def configure(self, context: ParserContext) -> None:
-        pass
-
-    # ------------------------------------------------------------------
-    # Result accessors
-    # ------------------------------------------------------------------
-
-    def get_text(self) -> str | None:
-        return self.text
-
-    def get_date(self) -> datetime.datetime | None:
-        return self.date
-
-    def get_archive_path(self) -> Path | None:
-        return self.archive_path
-
-    # ------------------------------------------------------------------
-    # Thumbnail, page count, and metadata
-    # ------------------------------------------------------------------
-
-    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
-        return make_thumbnail_from_pdf(
-            self.archive_path or Path(document_path),
-            self.tempdir,
-        )
-
-    def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
+    def get_page_count(self, document_path, mime_type):
+        page_count = None
        if mime_type == "application/pdf":
-            from paperless.parsers.utils import get_page_count_for_pdf
+            try:
+                import pikepdf

-            return get_page_count_for_pdf(Path(document_path), log=self.log)
-        return None
+                with pikepdf.Pdf.open(document_path) as pdf:
+                    page_count = len(pdf.pages)
+            except Exception as e:
+                self.log.warning(
+                    f"Unable to determine PDF page count {document_path}: {e}",
+                )
+        return page_count

-    def extract_metadata(
-        self,
-        document_path: Path,
-        mime_type: str,
-    ) -> list[MetadataEntry]:
-        if mime_type != "application/pdf":
-            return []
+    def extract_metadata(self, document_path, mime_type):
+        """Return XMP metadata entries of a PDF as dicts; [] for other MIME types."""
+        result = []
+        if mime_type == "application/pdf":
+            import pikepdf
 
-        from paperless.parsers.utils import extract_pdf_metadata
+            namespace_pattern = re.compile(r"\{(.*)\}(.*)")
 
-        return extract_pdf_metadata(Path(document_path), log=self.log)
+            with pikepdf.open(document_path) as pdf:  # handle is closed on exit
+                meta = pdf.open_metadata()
+                for key, value in meta.items():
+                    if isinstance(value, list):
+                        value = " ".join([str(e) for e in value])
+                    value = str(value)
+                    try:
+                        m = namespace_pattern.match(key)
+                        if m is None:  # pragma: no cover
+                            continue
+                        namespace, key_value = m.groups()
+                        try:
+                            namespace.encode("utf-8")
+                            key_value.encode("utf-8")
+                        except UnicodeEncodeError as e:  # pragma: no cover
+                            self.log.debug(f"Skipping metadata key {key}: {e}")
+                            continue
+                        result.append(
+                            {
+                                "namespace": namespace,
+                                "prefix": meta.REVERSE_NS[namespace],
+                                "key": key_value,
+                                "value": value,
+                            },
+                        )
+                    except Exception as e:
+                        self.log.warning(
+                            f"Error while reading metadata {key}: {value}. Error: {e}",
+                        )
+        return result

-    def is_image(self, mime_type: str) -> bool:
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
+        return make_thumbnail_from_pdf(
+            self.archive_path or document_path,
+            self.tempdir,
+            self.logging_group,
+        )
+
+    def is_image(self, mime_type) -> bool:
        return mime_type in [
            "image/png",
            "image/jpeg",
@@ -182,25 +111,25 @@ class RasterisedDocumentParser:
            "image/heic",
        ]

-    def has_alpha(self, image: Path) -> bool:
+    def has_alpha(self, image) -> bool:
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")

-    def remove_alpha(self, image_path: Path) -> Path:
+    def remove_alpha(self, image_path: str) -> Path:
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
-                str(image_path),
-                str(no_alpha_image),
+                image_path,
+                no_alpha_image,
            ],
            logger=self.log,
        )
        return no_alpha_image

-    def get_dpi(self, image: Path) -> int | None:
+    def get_dpi(self, image) -> int | None:
        try:
            with Image.open(image) as im:
                x, _ = im.info["dpi"]
@@ -209,7 +138,7 @@ class RasterisedDocumentParser:
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
            return None

-    def calculate_a4_dpi(self, image: Path) -> int | None:
+    def calculate_a4_dpi(self, image) -> int | None:
        try:
            with Image.open(image) as im:
                width, _ = im.size
@@ -227,7 +156,6 @@ class RasterisedDocumentParser:
        sidecar_file: Path | None,
        pdf_file: Path,
    ) -> str | None:
-        text: str | None = None
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
@@ -235,7 +163,7 @@ class RasterisedDocumentParser:
            and sidecar_file.is_file()
            and self.settings.mode != "redo"
        ):
-            text = read_file_handle_unicode_errors(sidecar_file)
+            text = self.read_file_handle_unicode_errors(sidecar_file)

            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
@@ -263,12 +191,12 @@ class RasterisedDocumentParser:
                        "-layout",
                        "-enc",
                        "UTF-8",
-                        str(pdf_file),
+                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
-                text = read_file_handle_unicode_errors(Path(tmp.name))
+                text = self.read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)

@@ -283,14 +211,16 @@ class RasterisedDocumentParser:

    def construct_ocrmypdf_parameters(
        self,
-        input_file: Path,
-        mime_type: str,
-        output_file: Path,
-        sidecar_file: Path,
+        input_file,
+        mime_type,
+        output_file,
+        sidecar_file,
        *,
-        safe_fallback: bool = False,
-    ) -> dict[str, Any]:
-        ocrmypdf_args: dict[str, Any] = {
+        safe_fallback=False,
+    ):
+        if TYPE_CHECKING:
+            assert isinstance(self.settings, OcrConfig)
+        ocrmypdf_args = {
            "input_file_or_options": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
@@ -400,13 +330,7 @@ class RasterisedDocumentParser:

        return ocrmypdf_args

-    def parse(
-        self,
-        document_path: Path,
-        mime_type: str,
-        *,
-        produce_archive: bool = True,
-    ) -> None:
+    def parse(self, document_path: Path, mime_type, file_name=None) -> None:
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        VALID_TEXT_LENGTH = 50
@@ -534,7 +458,7 @@ class RasterisedDocumentParser:
                self.text = ""


-def post_process_text(text: str | None) -> str | None:
+def post_process_text(text):
    if not text:
        return None

--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -1,23 +1,10 @@
-from __future__ import annotations
+def get_parser(*args, **kwargs):
+    from paperless_tesseract.parsers import RasterisedDocumentParser

-from typing import Any
-
-
-def get_parser(*args: Any, **kwargs: Any) -> Any:
-    from paperless.parsers.tesseract import RasterisedDocumentParser
-
-    # RasterisedDocumentParser accepts logging_group for constructor compatibility but
-    # does not store or use it (no legacy DocumentParser base class).
-    # progress_callback is also not used.  Both may arrive as a positional arg
-    # (consumer) or a keyword arg (views); *args absorbs the positional form,
-    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
-    # path with the new ParserRegistry so the shim can be removed at that point.
-    kwargs.pop("logging_group", None)
-    kwargs.pop("progress_callback", None)
    return RasterisedDocumentParser(*args, **kwargs)


-def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
+def tesseract_consumer_declaration(sender, **kwargs):
    return {
        "parser": get_parser,
        "weight": 0,
--- a/src/paperless_tesseract/tests/samples/document.webp
+++ b/src/paperless_tesseract/tests/samples/document.webp
--- a/src/paperless_tesseract/tests/samples/encrypted.pdf
+++ b/src/paperless_tesseract/tests/samples/encrypted.pdf
--- a/src/paperless_tesseract/tests/samples/multi-page-digital.pdf
+++ b/src/paperless_tesseract/tests/samples/multi-page-digital.pdf
--- a/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff
+++ b/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff
--- a/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff
+++ b/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff
--- a/src/paperless_tesseract/tests/samples/multi-page-images.pdf
+++ b/src/paperless_tesseract/tests/samples/multi-page-images.pdf
--- a/src/paperless_tesseract/tests/samples/multi-page-images.tiff
+++ b/src/paperless_tesseract/tests/samples/multi-page-images.tiff
--- a/src/paperless_tesseract/tests/samples/multi-page-mixed.pdf
+++ b/src/paperless_tesseract/tests/samples/multi-page-mixed.pdf
--- a/src/paperless_tesseract/tests/samples/no-text-alpha.png
+++ b/src/paperless_tesseract/tests/samples/no-text-alpha.png
--- a/src/paperless_tesseract/tests/samples/rotated.pdf
+++ b/src/paperless_tesseract/tests/samples/rotated.pdf
--- a/src/paperless_tesseract/tests/samples/rtl-test.pdf
+++ b/src/paperless_tesseract/tests/samples/rtl-test.pdf
--- a/src/paperless_tesseract/tests/samples/signed.pdf
+++ b/src/paperless_tesseract/tests/samples/signed.pdf
--- a/src/paperless_tesseract/tests/samples/simple-alpha.png
+++ b/src/paperless_tesseract/tests/samples/simple-alpha.png
--- a/src/paperless_tesseract/tests/samples/simple-digital.pdf
+++ b/src/paperless_tesseract/tests/samples/simple-digital.pdf
--- a/src/paperless_tesseract/tests/samples/simple-no-dpi.png
+++ b/src/paperless_tesseract/tests/samples/simple-no-dpi.png
--- a/src/paperless_tesseract/tests/samples/simple.bmp
+++ b/src/paperless_tesseract/tests/samples/simple.bmp
--- a/src/paperless_tesseract/tests/samples/simple.gif
+++ b/src/paperless_tesseract/tests/samples/simple.gif
--- a/src/paperless_tesseract/tests/samples/simple.heic
+++ b/src/paperless_tesseract/tests/samples/simple.heic
--- a/src/paperless_tesseract/tests/samples/simple.jpg
+++ b/src/paperless_tesseract/tests/samples/simple.jpg
--- a/src/paperless_tesseract/tests/samples/simple.png
+++ b/src/paperless_tesseract/tests/samples/simple.png
--- a/src/paperless_tesseract/tests/samples/simple.tif
+++ b/src/paperless_tesseract/tests/samples/simple.tif
--- a/src/paperless_tesseract/tests/samples/single-page-mixed.pdf
+++ b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf
--- a/src/paperless_tesseract/tests/samples/with-form.pdf
+++ b/src/paperless_tesseract/tests/samples/with-form.pdf
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -0,0 +1,924 @@
+import shutil
+import tempfile
+import unicodedata
+import uuid
+from pathlib import Path
+from unittest import mock
+
+from django.test import TestCase
+from django.test import override_settings
+from ocrmypdf import SubprocessOutputError
+
+from documents.parsers import ParseError
+from documents.parsers import run_convert
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_tesseract.parsers import RasterisedDocumentParser
+from paperless_tesseract.parsers import post_process_text
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content, strings) -> None:
+        # Asserts that all strings appear in content, in the given order.
+        indices = []
+        for s in strings:
+            if s in content:
+                indices.append(content.index(s))
+            else:
+                self.fail(f"'{s}' is not in '{content}'")
+        self.assertListEqual(indices, sorted(indices))
+
+    def test_post_process_text(self) -> None:
+        text_cases = [
+            ("simple     string", "simple string"),
+            ("simple    newline\n   testing string", "simple newline\ntesting string"),
+            (
+                "utf-8   строка с пробелами в конце  ",
+                "utf-8 строка с пробелами в конце",
+            ),
+        ]
+
+        for source, result in text_cases:
+            actual_result = post_process_text(source)
+            self.assertEqual(
+                result,
+                actual_result,
+                f"strip_exceess_whitespace({source}) != '{result}', but '{actual_result}'",
+            )
+
+    def test_get_text_from_pdf(self) -> None:
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        text = parser.extract_text(
+            None,
+            self.SAMPLE_FILES / "simple-digital.pdf",
+        )
+
+        self.assertContainsStrings(text.strip(), ["This is a test document."])
+
+    def test_get_page_count(self) -> None:
+        """
+        GIVEN:
+            - PDF file with a single page
+            - PDF file with multiple pages
+        WHEN:
+            - The number of pages is requested
+        THEN:
+            - The method returns 1 as the expected number of pages
+            - The method returns the correct number of pages (6)
+        """
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        page_count = parser.get_page_count(
+            str(self.SAMPLE_FILES / "simple-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertEqual(page_count, 1)
+
+        page_count = parser.get_page_count(
+            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
+            "application/pdf",
+        )
+        self.assertEqual(page_count, 6)
+
+    def test_get_page_count_password_protected(self) -> None:
+        """
+        GIVEN:
+            - Password protected PDF file
+        WHEN:
+            - The number of pages is requested
+        THEN:
+            - The method returns None
+        """
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
+            page_count = parser.get_page_count(
+                str(self.SAMPLE_FILES / "password-protected.pdf"),
+                "application/pdf",
+            )
+            self.assertEqual(page_count, None)
+            self.assertIn("Unable to determine PDF page count", cm.output[0])
+
+    def test_thumbnail(self) -> None:
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        thumb = parser.get_thumbnail(
+            str(self.SAMPLE_FILES / "simple-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(thumb)
+
+    @mock.patch("documents.parsers.run_convert")
+    def test_thumbnail_fallback(self, m) -> None:
+        def call_convert(input_file, output_file, **kwargs) -> None:
+            if ".pdf" in str(input_file):
+                raise ParseError("Does not compute.")
+            else:
+                run_convert(input_file=input_file, output_file=output_file, **kwargs)
+
+        m.side_effect = call_convert
+
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        thumb = parser.get_thumbnail(
+            str(self.SAMPLE_FILES / "simple-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(thumb)
+
+    def test_thumbnail_encrypted(self) -> None:
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        thumb = parser.get_thumbnail(
+            str(self.SAMPLE_FILES / "encrypted.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(thumb)
+
+    def test_get_dpi(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
+        self.assertEqual(dpi, None)
+
+        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
+        self.assertEqual(dpi, 72)
+
+    def test_simple_digital(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(
+            str(self.SAMPLE_FILES / "simple-digital.pdf"),
+            "application/pdf",
+        )
+
+        self.assertIsFile(parser.archive_path)
+
+        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
+
+    def test_with_form(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(
+            str(self.SAMPLE_FILES / "with-form.pdf"),
+            "application/pdf",
+        )
+
+        self.assertIsFile(parser.archive_path)
+
+        self.assertContainsStrings(
+            parser.get_text(),
+            ["Please enter your name in here:", "This is a PDF document with a form."],
+        )
+
+    @override_settings(OCR_MODE="redo")
+    def test_with_form_error(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(
+            str(self.SAMPLE_FILES / "with-form.pdf"),
+            "application/pdf",
+        )
+
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text(),
+            ["Please enter your name in here:", "This is a PDF document with a form."],
+        )
+
+    @override_settings(OCR_MODE="skip")
+    def test_signed(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
+
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text(),
+            [
+                "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
+                "automated testing of signed/encrypted PDFs",
+            ],
+        )
+
+    @override_settings(OCR_MODE="skip")
+    def test_encrypted(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(
+            str(self.SAMPLE_FILES / "encrypted.pdf"),
+            "application/pdf",
+        )
+
+        self.assertIsNone(parser.archive_path)
+        self.assertEqual(parser.get_text(), "")
+
+    @override_settings(OCR_MODE="redo")
+    def test_with_form_error_notext(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "with-form.pdf"),
+            "application/pdf",
+        )
+
+        self.assertContainsStrings(
+            parser.get_text(),
+            ["Please enter your name in here:", "This is a PDF document with a form."],
+        )
+
+    @override_settings(OCR_MODE="force")
+    def test_with_form_force(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(
+            str(self.SAMPLE_FILES / "with-form.pdf"),
+            "application/pdf",
+        )
+
+        self.assertContainsStrings(
+            parser.get_text(),
+            ["Please enter your name in here:", "This is a PDF document with a form."],
+        )
+
+    def test_image_simple(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
+
+        self.assertIsFile(parser.archive_path)
+
+        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
+
+    def test_image_simple_alpha(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        with tempfile.TemporaryDirectory() as tempdir:
+            # Copy the sample file to a temp directory, because parsing modifies
+            # the file and Git would then report the sample as changed
+            sample_file = self.SAMPLE_FILES / "simple-alpha.png"
+            dest_file = Path(tempdir) / "simple-alpha.png"
+            shutil.copy(sample_file, dest_file)
+
+            parser.parse(str(dest_file), "image/png")
+
+            self.assertIsFile(parser.archive_path)
+
+            self.assertContainsStrings(parser.get_text(), ["This is a test document."])
+
+    def test_image_calc_a4_dpi(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        dpi = parser.calculate_a4_dpi(
+            str(self.SAMPLE_FILES / "simple-no-dpi.png"),
+        )
+
+        self.assertEqual(dpi, 62)
+
+    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
+    def test_image_dpi_fail(self, m) -> None:
+        m.return_value = None
+        parser = RasterisedDocumentParser(None)
+
+        def f() -> None:
+            parser.parse(
+                str(self.SAMPLE_FILES / "simple-no-dpi.png"),
+                "image/png",
+            )
+
+        self.assertRaises(ParseError, f)
+
+    @override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
+    def test_image_no_dpi_default(self) -> None:
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
+
+        self.assertIsFile(parser.archive_path)
+
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["this is a test document."],
+        )
+
+    def test_multi_page(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
+    def test_multi_page_pages_skip(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
+    def test_multi_page_pages_redo(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="force")
+    def test_multi_page_pages_force(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_MODE="skip")
+    def test_multi_page_analog_pages_skip(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
+    def test_multi_page_analog_pages_redo(self) -> None:
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR of only pages 1 and 2 requested
+            - OCR mode set to redo
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text of page 1 and 2 extracted
+            - An archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
+        self.assertNotIn("page 3", parser.get_text().lower())
+
+    @override_settings(OCR_PAGES=1, OCR_MODE="force")
+    def test_multi_page_analog_pages_force(self) -> None:
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR of only page 1 requested
+            - OCR mode set to force
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Only text of page 1 is extracted
+            - An archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
+        self.assertNotIn("page 2", parser.get_text().lower())
+        self.assertNotIn("page 3", parser.get_text().lower())
+
+    @override_settings(OCR_MODE="skip_noarchive")
+    def test_skip_noarchive_withtext(self) -> None:
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_MODE="skip_noarchive")
+    def test_skip_noarchive_notext(self) -> None:
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - An archive file is created with the OCRd text
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+        self.assertIsNotNone(parser.archive_path)
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+    def test_skip_archive_never_withtext(self) -> None:
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to never
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+    def test_skip_archive_never_withimages(self) -> None:
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to never
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+    def test_skip_archive_withtext_withtext(self) -> None:
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to with_text
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+    def test_skip_archive_withtext_withimages(self) -> None:
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to with_text
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+    def test_skip_archive_always_withtext(self) -> None:
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to always
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+    def test_skip_archive_always_withimages(self) -> None:
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to always
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_MODE="skip")
+    def test_multi_page_mixed(self) -> None:
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - OCR mode set to skip
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - An archive file is created with the OCRd text and the original text
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
+        )
+
+        with (parser.tempdir / "sidecar.txt").open() as f:
+            sidecar = f.read()
+
+        self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
+
+    @override_settings(OCR_MODE="redo")
+    def test_single_page_mixed(self) -> None:
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - Text and images are mixed on the same page
+            - OCR mode set to redo
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Full content of the file is parsed (not just the image text)
+            - An archive file is created with the OCRd text and the original text
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            [
+                "this is some normal text, present on page 1 of the document.",
+                "this is some text, but in an image, also on page 1.",
+                "this is further text on page 1.",
+            ],
+        )
+
+        with (parser.tempdir / "sidecar.txt").open() as f:
+            sidecar = f.read().lower()
+
+        self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
+        self.assertNotIn(
+            "this is some normal text, present on page 1 of the document.",
+            sidecar,
+        )
+
+    @override_settings(OCR_MODE="skip_noarchive")
+    def test_multi_page_mixed_no_archive(self) -> None:
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created as original file contains text
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 4", "page 5", "page 6"],
+        )
+
+    @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
+    def test_rotate(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
+        self.assertContainsStrings(
+            parser.get_text(),
+            [
+                "This is the text that appears on the first page. It’s a lot of text.",
+                "Even if the pages are rotated, OCRmyPDF still gets the job done.",
+                "This is a really weird file with lots of nonsense text.",
+                "If you read this, it’s your own fault. Also check your screen orientation.",
+            ],
+        )
+
+    def test_multi_page_tiff(self) -> None:
+        """
+        GIVEN:
+            - Multi-page TIFF image
+        WHEN:
+            - Image is parsed
+        THEN:
+            - Text from all pages extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "multi-page-images.tiff"),
+            "image/tiff",
+        )
+        self.assertIsFile(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    def test_multi_page_tiff_alpha(self) -> None:
+        """
+        GIVEN:
+            - Multi-page TIFF image
+            - Image includes an alpha channel
+        WHEN:
+            - Image is parsed
+        THEN:
+            - Text from all pages extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            shutil.copy(sample_file, tmp_file.name)
+            parser.parse(
+                tmp_file.name,
+                "image/tiff",
+            )
+            self.assertIsFile(parser.archive_path)
+            self.assertContainsStrings(
+                parser.get_text().lower(),
+                ["page 1", "page 2", "page 3"],
+            )
+
+    def test_multi_page_tiff_alpha_srgb(self) -> None:
+        """
+        GIVEN:
+            - Multi-page TIFF image
+            - Image includes an alpha channel
+            - Image is in the sRGB colorspace
+        WHEN:
+            - Image is parsed
+        THEN:
+            - Text from all pages extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        sample_file = str(
+            self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
+        )
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            shutil.copy(sample_file, tmp_file.name)
+            parser.parse(
+                tmp_file.name,
+                "image/tiff",
+            )
+            self.assertIsFile(parser.archive_path)
+            self.assertContainsStrings(
+                parser.get_text().lower(),
+                ["page 1", "page 2", "page 3"],
+            )
+
+    def test_ocrmypdf_parameters(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        params = parser.construct_ocrmypdf_parameters(
+            input_file="input.pdf",
+            output_file="output.pdf",
+            sidecar_file="sidecar.txt",
+            mime_type="application/pdf",
+            safe_fallback=False,
+        )
+
+        self.assertEqual(params["input_file_or_options"], "input.pdf")
+        self.assertEqual(params["output_file"], "output.pdf")
+        self.assertEqual(params["sidecar"], "sidecar.txt")
+
+        with override_settings(OCR_CLEAN="none"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertNotIn("clean", params)
+            self.assertNotIn("clean_final", params)
+
+        with override_settings(OCR_CLEAN="clean"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertTrue(params["clean"])
+            self.assertNotIn("clean_final", params)
+
+        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertTrue(params["clean_final"])
+            self.assertNotIn("clean", params)
+
+        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertTrue(params["clean"])
+            self.assertNotIn("clean_final", params)
+
+        with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertTrue(params["deskew"])
+
+        with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertNotIn("deskew", params)
+
+        with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertNotIn("deskew", params)
+
+        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertIn("max_image_mpixels", params)
+            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
+
+        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
+            parser = RasterisedDocumentParser(None)
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertNotIn("max_image_mpixels", params)
+
+    def test_rtl_language_detection(self) -> None:
+        """
+        GIVEN:
+            - File with text in an RTL language
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from the document is extracted
+        """
+        parser = RasterisedDocumentParser(None)
+
+        parser.parse(
+            str(self.SAMPLE_FILES / "rtl-test.pdf"),
+            "application/pdf",
+        )
+
+        # OCR output for RTL text varies across platforms/versions due to
+        # bidi controls and presentation forms; normalize before assertion.
+        normalized_text = "".join(
+            char
+            for char in unicodedata.normalize("NFKC", parser.get_text())
+            if unicodedata.category(char) != "Cf" and not char.isspace()
+        )
+
+        self.assertIn("ةرازو", normalized_text)
+        self.assertTrue(
+            any(token in normalized_text for token in ("ةیلخادلا", "الاخليد")),
+        )
+
+    @mock.patch("ocrmypdf.ocr")
+    def test_gs_rendering_error(self, m) -> None:
+        m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
+        parser = RasterisedDocumentParser(None)
+
+        self.assertRaises(
+            ParseError,
+            parser.parse,
+            str(self.SAMPLE_FILES / "simple-digital.pdf"),
+            "application/pdf",
+        )
+
+
+class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).parent / "samples"
+
+    def test_bmp(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
+        self.assertIsFile(parser.archive_path)
+        self.assertIn("this is a test document", parser.get_text().lower())
+
+    def test_jpg(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
+        self.assertIsFile(parser.archive_path)
+        self.assertIn("this is a test document", parser.get_text().lower())
+
+    def test_heic(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
+        self.assertIsFile(parser.archive_path)
+        self.assertIn("pizza", parser.get_text().lower())
+
+    @override_settings(OCR_IMAGE_DPI=200)
+    def test_gif(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
+        self.assertIsFile(parser.archive_path)
+        self.assertIn("this is a test document", parser.get_text().lower())
+
+    def test_tiff(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
+        self.assertIsFile(parser.archive_path)
+        self.assertIn("this is a test document", parser.get_text().lower())
+
+    @override_settings(OCR_IMAGE_DPI=72)
+    def test_webp(self) -> None:
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            str(self.SAMPLE_FILES / "document.webp"),
+            "image/webp",
+        )
+        self.assertIsFile(parser.archive_path)
+        # Older tesseracts consistently mangle the space between "a webp";
+        # tesseract 5.3.0 seems to do a better job, so we accept both forms
+        self.assertRegex(
+            parser.get_text().lower(),
+            r"this is a ?webp document, created 11/14/2022.",
+        )
--- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py
+++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py
@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
 from paperless.models import ColorConvertChoices
 from paperless.models import ModeChoices
 from paperless.models import OutputTypeChoices
-from paperless.parsers.tesseract import RasterisedDocumentParser
+from paperless_tesseract.parsers import RasterisedDocumentParser


 class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,9 +1,4 @@
-from __future__ import annotations
-
-from typing import Any
-
-
-def get_parser(*args: Any, **kwargs: Any) -> Any:
+def get_parser(*args, **kwargs):
    from paperless.parsers.text import TextDocumentParser

    # TextDocumentParser accepts logging_group for constructor compatibility but
@@ -14,10 +9,10 @@ def get_parser(*args: Any, **kwargs: Any) -> Any:
    # path with the new ParserRegistry so the shim can be removed at that point.
    kwargs.pop("logging_group", None)
    kwargs.pop("progress_callback", None)
-    return TextDocumentParser(*args, **kwargs)
+    return TextDocumentParser()


-def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
+def text_consumer_declaration(sender, **kwargs):
    return {
        "parser": get_parser,
        "weight": 10,
--- a/uv.lock
+++ b/uv.lock
@@ -4754,11 +4754,11 @@ wheels = [

 [[package]]
 name = "tinytag"
-version = "2.2.1"
+version = "2.2.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/96/59/8a8cb2331e2602b53e4dc06960f57d1387a2b18e7efd24e5f9cb60ea4925/tinytag-2.2.1.tar.gz", hash = "sha256:e6d06610ebe7cd66fd07be2d3b9495914ab32654a5e47657bb8cd44c2484523c", size = 38214, upload-time = "2026-03-15T18:48:01.11Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/98/07/fb260bac73119f369a10e884016516d07cd760b5068e703773f83dd5e7bf/tinytag-2.2.0.tar.gz", hash = "sha256:f15b082510f6e0fc717e597edc8759d6f2d3ff6194ac0f3bcd675a9a09d9b798", size = 38120, upload-time = "2025-12-15T21:10:19.093Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ce/34/d50e338631baaf65ec5396e70085e5de0b52b24b28db1ffbc1c6e82190dc/tinytag-2.2.1-py3-none-any.whl", hash = "sha256:ed8b1e6d25367937e3321e054f4974f9abfde1a3e0a538824c87da377130c2b6", size = 32927, upload-time = "2026-03-15T18:47:59.613Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/e2/9818fcebb348237389d2ac2fea97cf2b2638378a0866105a45ae9be49728/tinytag-2.2.0-py3-none-any.whl", hash = "sha256:d2cf3ef8ee0f6c854663f77d9d5f8159ee1c834c70f5ea4f214ddc4af8148f79", size = 32861, upload-time = "2025-12-15T21:10:17.63Z" },
 ]

 [[package]]
@@ -5643,7 +5643,7 @@ wheels = [

 [[package]]
 name = "zensical"
-version = "0.0.26"
+version = "0.0.25"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -5653,18 +5653,16 @@ dependencies = [
    { name = "pymdown-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d5/1f/0a0b1ce8e0553a9dabaedc736d0f34b11fc33d71ff46bce44d674996d41f/zensical-0.0.26.tar.gz", hash = "sha256:f4d9c8403df25fbb3d6dd9577122dc2f23c73a2d16ab778bb7d40370dd71e987", size = 3841473, upload-time = "2026-03-11T09:51:38.838Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/18/69/4b49ce778059b4888ea854cf4db40e1b2080fe828b7280198999048d6fce/zensical-0.0.25.tar.gz", hash = "sha256:462808359d949469fa7209d367f2e38ed796744074e5dadeac9ddfef0c44be25", size = 3841318, upload-time = "2026-03-10T19:32:35.048Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/41/58/fa3d9538ff1ea8cf4a193edbf47254f374fa7983fcfa876bb4336d72c53a/zensical-0.0.26-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7823b25afe7d36099253aa59d643abaac940f80fd015d4a37954210c87d3da56", size = 12263607, upload-time = "2026-03-11T09:50:49.202Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/6e/44a3b21bd3569b9cad203364d73a956768d28a879e4c2be91bd889f74d2c/zensical-0.0.26-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:c0254814382cdd3769bc7689180d09bf41de8879871dd736dc52d5f141e8ada7", size = 12144562, upload-time = "2026-03-11T09:50:53.685Z" },
-    { url = "https://files.pythonhosted.org/packages/07/ae/31b9885745b3e7ef23a3ae7f175b879807288d11b3fb7e2d3c119c916258/zensical-0.0.26-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c8e601b2bbd239e564b04cf235eefb9777e7dfc7e1857b8871d6cdcfb577aa0", size = 12506728, upload-time = "2026-03-11T09:50:57.775Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/93/f5291e2c47076474f181f6eef35ef0428117d3f192da4358c0511e2ce09e/zensical-0.0.26-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2dc43c7e6c25d9724fc0450f0273ca4e5e2506eeb7f89f52f1405a592896ca3b", size = 12454975, upload-time = "2026-03-11T09:51:01.514Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/2e/61cac4f2ebad31dab768eb02753ffde9e56d4d34b8f876b949bf516fbd50/zensical-0.0.26-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24ed236d1254cc474c19227eaa3670a1ccf921af53134ec5542b05853bdcd59c", size = 12791930, upload-time = "2026-03-11T09:51:05.162Z" },
-    { url = "https://files.pythonhosted.org/packages/02/86/51995d1ed2dd6ad8a1a70bcdf3c5eb16b50e62ea70e638d454a6b9061c4d/zensical-0.0.26-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1110147710d1dd025d932c4a7eada836bdf079c91b70fb0ae5b202e14b094617", size = 12548166, upload-time = "2026-03-11T09:51:09.218Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/93/decbafdbfc77170cbc3851464632390846e9aaf45e743c8dd5a24d5673e9/zensical-0.0.26-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7d21596a785428cdebc20859bd94a05334abe14ad24f1bb9cd80d19219e3c220", size = 12682103, upload-time = "2026-03-11T09:51:12.68Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/e2/391d2d08dde621177da069a796a886b549fefb15734aeeb6e696af99b662/zensical-0.0.26-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:680a3c7bb71499b4da784d6072e44b3d7b8c0df3ce9bbd9974e24bd8058c2736", size = 12724219, upload-time = "2026-03-11T09:51:17.32Z" },
-    { url = "https://files.pythonhosted.org/packages/80/2a/21b40c5c40a67da8a841f278d61dbd8d5e035e489de6fe1cef5f4e211b4f/zensical-0.0.26-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:e3294a79f98218b6fc2219232e166aa0932ae4dad58f6c8dbc0dbe0ecbff9c25", size = 12862117, upload-time = "2026-03-11T09:51:22.161Z" },
-    { url = "https://files.pythonhosted.org/packages/51/76/e1910d6d75d207654c867b8efbda6822dedda9fed3601bf4a864a1f4fe26/zensical-0.0.26-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:630229587df1fb47be184a4a69d0772ce59a44cd2c481ae9f7e8852fffaff11e", size = 12815714, upload-time = "2026-03-11T09:51:26.24Z" },
+    { url = "https://files.pythonhosted.org/packages/42/7c/f6f5eb1903b5a557d98f48d09e3d4bc33033ed78508986250dabe5387d91/zensical-0.0.25-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c481dd16a968f97d43f6b596e10e941d8294ed446b8b117235a6b149c0d6965", size = 12263809, upload-time = "2026-03-10T19:31:49.907Z" },
+    { url = "https://files.pythonhosted.org/packages/37/b2/3f8be43526a68c52c84f099887d1903c2526a22aa4344378a72671bf6070/zensical-0.0.25-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ae51751e8b11f50df04641b40c1e07d4b703fed9d9548b16dbcb0cf260da229a", size = 12146107, upload-time = "2026-03-10T19:31:53.576Z" },
+    { url = "https://files.pythonhosted.org/packages/16/59/89a3a715b1fe538b4b5ee382d71b86bd06d4f351383e36eefd36e824c150/zensical-0.0.25-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56ccf88245bd0b3684bf313384164972f1890802d4a51dd9b7ae6ea126a810bc", size = 12505963, upload-time = "2026-03-10T19:31:57.517Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/5b/cc0bada291818bdf36be777af9c16f655a021f16578a31e6fb233affca03/zensical-0.0.25-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2f4e58bcc06f3e50cc518666a0c9d8f82246255a42b37bb1d7c7343e214fbac", size = 12455496, upload-time = "2026-03-10T19:32:02.37Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/16/ff91ee42d8b14a1b63e2e0d74922e6c4b0ec1da3819377f20b7ca2742f76/zensical-0.0.25-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:69895273b1319a45667abac543c3e5065ff2a646d9a698eae056b6a35b57e00a", size = 12683609, upload-time = "2026-03-10T19:32:06.144Z" },
+    { url = "https://files.pythonhosted.org/packages/01/fd/a85acc4234d31658f4bb54c4900edfc8d4227ad83e4c79de92cfdcd05c79/zensical-0.0.25-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c51a00ae1de2e9647bfd0ea1965b223fb3891111a00930416e1277e06f3ab3c4", size = 12725420, upload-time = "2026-03-10T19:32:09.938Z" },
+    { url = "https://files.pythonhosted.org/packages/37/c7/896c91e457af3d5769d8d70d2bd66a8a287ad129879b51ab5e985ac68889/zensical-0.0.25-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:28e56ec1f06ea66227c1f5af9d7a6ed3bd4246e6af1e45d29e09f40251b52e1f", size = 12861970, upload-time = "2026-03-10T19:32:13.471Z" },
+    { url = "https://files.pythonhosted.org/packages/41/06/5d804cf19e4e093394674d9f213546dc1364a34fd85d81a1153b05733c5a/zensical-0.0.25-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2d5997baad148b65eb0de6baf81973110538e01a3f64467d06d0c5ac23b0d70", size = 12816321, upload-time = "2026-03-10T19:32:17.031Z" },
 ]

 [[package]]
Author	SHA1	Message	Date
Trenton H	6a4da4c46e	Adds a docstring that an IDE will render better	2026-03-18 15:26:10 -07:00
Trenton H	741486df16	Handles the rename of the migration	2026-03-18 15:23:37 -07:00
Trenton H	cdbb118f1c	Fixes logging so I can see it	2026-03-18 15:22:18 -07:00
Trenton H	cea5971ad8	Batch based iteration and bulk updates, with chunked file reading	2026-03-18 15:22:18 -07:00
Trenton H	156ee4e2ee	Transitions to SHA256 based checksums	2026-03-18 15:22:17 -07:00