Moves the checks and tests to the main application and removes the old applications

refactor: remove empty paperless_text and paperless_tika Django apps
After parser classes were moved to paperless/parsers/ in the plugin refactor, these Django apps contained only empty AppConfig classes with no models, views, tasks, migrations, or other functionality.

- Remove paperless_text and paperless_tika from INSTALLED_APPS
- Delete empty app directories entirely
- Update pyproject.toml test exclusions
- Clean stale mypy baseline entries for moved parser files

The paperless_remote app is retained, as it contains meaningful system checks for Azure AI configuration.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 16:05:56 +00:00 · 2026-03-20 07:58:08 -07:00 · 2026-03-19 19:45:57 -07:00 · 2026-03-19 18:57:13 -07:00 · 2026-03-19 17:19:06 -07:00 · 2026-03-19 15:45:08 -07:00
102 changed files with 4812 additions and 2983 deletions
--- a/.mypy-baseline.txt
+++ b/.mypy-baseline.txt
@@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non
 src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean"  [union-attr]
 src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean"  [union-attr]
 src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args"  [union-attr]
-src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments  [no-untyped-def]
-src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments  [no-untyped-def]
-src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None")  [assignment]
-src/paperless_text/signals.py:0: error: Function is missing a type annotation  [no-untyped-def]
-src/paperless_text/signals.py:0: error: Function is missing a type annotation  [no-untyped-def]
-src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path"  [arg-type]
-src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation  [no-untyped-def]
-src/paperless_tika/parsers.py:0: error: Function is missing a type annotation  [no-untyped-def]
-src/paperless_tika/parsers.py:0: error: Function is missing a type annotation  [no-untyped-def]
-src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments  [no-untyped-def]
-src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments  [no-untyped-def]
-src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None")  [assignment]
-src/paperless_tika/signals.py:0: error: Function is missing a type annotation  [no-untyped-def]
-src/paperless_tika/signals.py:0: error: Function is missing a type annotation  [no-untyped-def]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -248,15 +248,13 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
 lint.per-file-ignores."src/documents/models.py" = [
  "SIM115",
 ]
-lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
-  "RUF001",
-]
+
 lint.isort.force-single-line = true

 [tool.codespell]
 write-changes = true
 ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
-skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
+skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"

 [tool.pytest]
 minversion = "9.0"
@@ -271,10 +269,6 @@ testpaths = [
  "src/documents/tests/",
  "src/paperless/tests/",
  "src/paperless_mail/tests/",
-  "src/paperless_tesseract/tests/",
-  "src/paperless_tika/tests",
-  "src/paperless_text/tests/",
-  "src/paperless_remote/tests/",
  "src/paperless_ai/tests",
 ]

--- a/src/documents/checks.py
+++ b/src/documents/checks.py
@@ -3,25 +3,20 @@ from django.core.checks import Error
 from django.core.checks import Warning
 from django.core.checks import register

-from documents.signals import document_consumer_declaration
 from documents.templating.utils import convert_format_str_to_template_format
+from paperless.parsers.registry import get_parser_registry


@register()
 def parser_check(app_configs, **kwargs):
-    parsers = []
-    for response in document_consumer_declaration.send(None):
-        parsers.append(response[1])
-
-    if len(parsers) == 0:
+    if not get_parser_registry().all_parsers():
        return [
            Error(
                "No parsers found. This is a bug. The consumer won't be "
                "able to consume any documents without parsers.",
            ),
        ]
-    else:
-        return []
+    return []


@register()
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -32,9 +32,7 @@ from documents.models import DocumentType
 from documents.models import StoragePath
 from documents.models import Tag
 from documents.models import WorkflowTrigger
-from documents.parsers import DocumentParser
 from documents.parsers import ParseError
-from documents.parsers import get_parser_class_for_mime_type
 from documents.permissions import set_permissions_for_object
 from documents.plugins.base import AlwaysRunPluginMixin
 from documents.plugins.base import ConsumeTaskPlugin
@@ -51,29 +49,13 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
-from paperless.parsers.text import TextDocumentParser
-from paperless.parsers.tika import TikaDocumentParser
-from paperless_mail.parsers import MailDocumentParser
+from paperless.parsers import ParserContext
+from paperless.parsers import ParserProtocol
+from paperless.parsers.registry import get_parser_registry

 LOGGING_NAME: Final[str] = "paperless.consumer"


-def _parser_cleanup(parser: DocumentParser) -> None:
-    """
-    Call cleanup on a parser, handling the new-style context-manager parsers.
-
-    New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
-    instead of a cleanup() method.  This shim will be removed once all existing parsers
-    have switched to the new style and this consumer is updated to use it
-
-    TODO(stumpylog): Remove me in the future
-    """
-    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
-        parser.__exit__(None, None, None)
-    else:
-        parser.cleanup()
-
-
 class WorkflowTriggerPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
@@ -410,8 +392,12 @@ class ConsumerPlugin(
                    self.log.error(f"Error attempting to clean PDF: {e}")

            # Based on the mime type, get the parser for that type
-            parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
-                mime_type,
+            parser_class: type[ParserProtocol] | None = (
+                get_parser_registry().get_parser_for_file(
+                    mime_type,
+                    self.filename,
+                    self.working_copy,
+                )
            )
            if not parser_class:
                tempdir.cleanup()
@@ -434,307 +420,275 @@ class ConsumerPlugin(
                tempdir.cleanup()
            raise

-        def progress_callback(
-            current_progress,
-            max_progress,
-        ) -> None:  # pragma: no cover
-            # recalculate progress to be within 20 and 80
-            p = int((current_progress / max_progress) * 50 + 20)
-            self._send_progress(p, 100, ProgressStatusOptions.WORKING)
-
        # This doesn't parse the document yet, but gives us a parser.
-
-        document_parser: DocumentParser = parser_class(
-            self.logging_group,
-            progress_callback=progress_callback,
-        )
-
-        # New-style parsers use __enter__/__exit__ for resource management.
-        # _parser_cleanup (below) handles __exit__; call __enter__ here.
-        # TODO(stumpylog): Remove me in the future
-        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
-            document_parser.__enter__()
-
-        self.log.debug(f"Parser: {type(document_parser).__name__}")
-
-        # Parse the document. This may take some time.
-
-        text = None
-        date = None
-        thumbnail = None
-        archive_path = None
-        page_count = None
-
-        try:
-            self._send_progress(
-                20,
-                100,
-                ProgressStatusOptions.WORKING,
-                ConsumerStatusShortMessage.PARSING_DOCUMENT,
+        with parser_class() as document_parser:
+            document_parser.configure(
+                ParserContext(mailrule_id=self.input_doc.mailrule_id),
            )
-            self.log.debug(f"Parsing {self.filename}...")
-            if (
-                isinstance(document_parser, MailDocumentParser)
-                and self.input_doc.mailrule_id
-            ):
-                document_parser.parse(
-                    self.working_copy,
-                    mime_type,
-                    self.filename,
-                    self.input_doc.mailrule_id,
-                )
-            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
-                # TODO(stumpylog): Remove me in the future
-                document_parser.parse(self.working_copy, mime_type)
-            else:
-                document_parser.parse(self.working_copy, mime_type, self.filename)

-            self.log.debug(f"Generating thumbnail for {self.filename}...")
-            self._send_progress(
-                70,
-                100,
-                ProgressStatusOptions.WORKING,
-                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
-            )
-            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
-                # TODO(stumpylog): Remove me in the future
-                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
-            else:
-                thumbnail = document_parser.get_thumbnail(
-                    self.working_copy,
-                    mime_type,
-                    self.filename,
-                )
+            self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")

-            text = document_parser.get_text()
-            date = document_parser.get_date()
-            if date is None:
+            # Parse the document. This may take some time.
+
+            text = None
+            date = None
+            thumbnail = None
+            archive_path = None
+            page_count = None
+
+            try:
                self._send_progress(
-                    90,
+                    20,
                    100,
                    ProgressStatusOptions.WORKING,
-                    ConsumerStatusShortMessage.PARSE_DATE,
+                    ConsumerStatusShortMessage.PARSING_DOCUMENT,
                )
-                with get_date_parser() as date_parser:
-                    date = next(date_parser.parse(self.filename, text), None)
-            archive_path = document_parser.get_archive_path()
-            page_count = document_parser.get_page_count(self.working_copy, mime_type)
+                self.log.debug(f"Parsing {self.filename}...")

-        except ParseError as e:
-            _parser_cleanup(document_parser)
-            if tempdir:
-                tempdir.cleanup()
-            self._fail(
-                str(e),
-                f"Error occurred while consuming document {self.filename}: {e}",
-                exc_info=True,
-                exception=e,
-            )
-        except Exception as e:
-            _parser_cleanup(document_parser)
-            if tempdir:
-                tempdir.cleanup()
-            self._fail(
-                str(e),
-                f"Unexpected error while consuming document {self.filename}: {e}",
-                exc_info=True,
-                exception=e,
-            )
+                document_parser.parse(self.working_copy, mime_type)

-        # Prepare the document classifier.
+                self.log.debug(f"Generating thumbnail for {self.filename}...")
+                self._send_progress(
+                    70,
+                    100,
+                    ProgressStatusOptions.WORKING,
+                    ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
+                )
+                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)

-        # TODO: I don't really like to do this here, but this way we avoid
-        #   reloading the classifier multiple times, since there are multiple
-        #   post-consume hooks that all require the classifier.
-
-        classifier = load_classifier()
-
-        self._send_progress(
-            95,
-            100,
-            ProgressStatusOptions.WORKING,
-            ConsumerStatusShortMessage.SAVE_DOCUMENT,
-        )
-        # now that everything is done, we can start to store the document
-        # in the system. This will be a transaction and reasonably fast.
-        try:
-            with transaction.atomic():
-                # store the document.
-                if self.input_doc.root_document_id:
-                    # If this is a new version of an existing document, we need
-                    # to make sure we're not creating a new document, but updating
-                    # the existing one.
-                    root_doc = Document.objects.get(
-                        pk=self.input_doc.root_document_id,
+                text = document_parser.get_text()
+                date = document_parser.get_date()
+                if date is None:
+                    self._send_progress(
+                        90,
+                        100,
+                        ProgressStatusOptions.WORKING,
+                        ConsumerStatusShortMessage.PARSE_DATE,
                    )
-                    original_document = self._create_version_from_root(
-                        root_doc,
-                        text=text,
-                        page_count=page_count,
-                        mime_type=mime_type,
-                    )
-                    actor = None
+                    with get_date_parser() as date_parser:
+                        date = next(date_parser.parse(self.filename, text), None)
+                archive_path = document_parser.get_archive_path()
+                page_count = document_parser.get_page_count(
+                    self.working_copy,
+                    mime_type,
+                )

-                    # Save the new version, potentially creating an audit log entry for the version addition if enabled.
-                    if (
-                        settings.AUDIT_LOG_ENABLED
-                        and self.metadata.actor_id is not None
-                    ):
-                        actor = User.objects.filter(pk=self.metadata.actor_id).first()
-                        if actor is not None:
-                            from auditlog.context import (  # type: ignore[import-untyped]
-                                set_actor,
-                            )
+            except ParseError as e:
+                if tempdir:
+                    tempdir.cleanup()
+                self._fail(
+                    str(e),
+                    f"Error occurred while consuming document {self.filename}: {e}",
+                    exc_info=True,
+                    exception=e,
+                )
+            except Exception as e:
+                if tempdir:
+                    tempdir.cleanup()
+                self._fail(
+                    str(e),
+                    f"Unexpected error while consuming document {self.filename}: {e}",
+                    exc_info=True,
+                    exception=e,
+                )

-                            with set_actor(actor):
+            # Prepare the document classifier.
+
+            # TODO: I don't really like to do this here, but this way we avoid
+            #   reloading the classifier multiple times, since there are multiple
+            #   post-consume hooks that all require the classifier.
+
+            classifier = load_classifier()
+
+            self._send_progress(
+                95,
+                100,
+                ProgressStatusOptions.WORKING,
+                ConsumerStatusShortMessage.SAVE_DOCUMENT,
+            )
+            # now that everything is done, we can start to store the document
+            # in the system. This will be a transaction and reasonably fast.
+            try:
+                with transaction.atomic():
+                    # store the document.
+                    if self.input_doc.root_document_id:
+                        # If this is a new version of an existing document, we need
+                        # to make sure we're not creating a new document, but updating
+                        # the existing one.
+                        root_doc = Document.objects.get(
+                            pk=self.input_doc.root_document_id,
+                        )
+                        original_document = self._create_version_from_root(
+                            root_doc,
+                            text=text,
+                            page_count=page_count,
+                            mime_type=mime_type,
+                        )
+                        actor = None
+
+                        # Save the new version, potentially creating an audit log entry for the version addition if enabled.
+                        if (
+                            settings.AUDIT_LOG_ENABLED
+                            and self.metadata.actor_id is not None
+                        ):
+                            actor = User.objects.filter(
+                                pk=self.metadata.actor_id,
+                            ).first()
+                            if actor is not None:
+                                from auditlog.context import (  # type: ignore[import-untyped]
+                                    set_actor,
+                                )
+
+                                with set_actor(actor):
+                                    original_document.save()
+                            else:
                                original_document.save()
                        else:
                            original_document.save()
+
+                        # Create a log entry for the version addition, if enabled
+                        if settings.AUDIT_LOG_ENABLED:
+                            from auditlog.models import (  # type: ignore[import-untyped]
+                                LogEntry,
+                            )
+
+                            LogEntry.objects.log_create(
+                                instance=root_doc,
+                                changes={
+                                    "Version Added": ["None", original_document.id],
+                                },
+                                action=LogEntry.Action.UPDATE,
+                                actor=actor,
+                                additional_data={
+                                    "reason": "Version added",
+                                    "version_id": original_document.id,
+                                },
+                            )
+                        document = original_document
                    else:
-                        original_document.save()
-
-                    # Create a log entry for the version addition, if enabled
-                    if settings.AUDIT_LOG_ENABLED:
-                        from auditlog.models import (  # type: ignore[import-untyped]
-                            LogEntry,
+                        document = self._store(
+                            text=text,
+                            date=date,
+                            page_count=page_count,
+                            mime_type=mime_type,
                        )

-                        LogEntry.objects.log_create(
-                            instance=root_doc,
-                            changes={
-                                "Version Added": ["None", original_document.id],
-                            },
-                            action=LogEntry.Action.UPDATE,
-                            actor=actor,
-                            additional_data={
-                                "reason": "Version added",
-                                "version_id": original_document.id,
-                            },
-                        )
-                    document = original_document
-                else:
-                    document = self._store(
-                        text=text,
-                        date=date,
-                        page_count=page_count,
-                        mime_type=mime_type,
-                    )
+                    # If we get here, it was successful. Proceed with post-consume
+                    # hooks. If they fail, nothing will get changed.

-                # If we get here, it was successful. Proceed with post-consume
-                # hooks. If they fail, nothing will get changed.
-
-                document_consumption_finished.send(
-                    sender=self.__class__,
-                    document=document,
-                    logging_group=self.logging_group,
-                    classifier=classifier,
-                    original_file=self.unmodified_original
-                    if self.unmodified_original
-                    else self.working_copy,
-                )
-
-                # After everything is in the database, copy the files into
-                # place. If this fails, we'll also rollback the transaction.
-                with FileLock(settings.MEDIA_LOCK):
-                    generated_filename = generate_unique_filename(document)
-                    if (
-                        len(str(generated_filename))
-                        > Document.MAX_STORED_FILENAME_LENGTH
-                    ):
-                        self.log.warning(
-                            "Generated source filename exceeds db path limit, falling back to default naming",
-                        )
-                        generated_filename = generate_filename(
-                            document,
-                            use_format=False,
-                        )
-                    document.filename = generated_filename
-                    create_source_path_directory(document.source_path)
-
-                    self._write(
-                        self.unmodified_original
-                        if self.unmodified_original is not None
+                    document_consumption_finished.send(
+                        sender=self.__class__,
+                        document=document,
+                        logging_group=self.logging_group,
+                        classifier=classifier,
+                        original_file=self.unmodified_original
+                        if self.unmodified_original
                        else self.working_copy,
-                        document.source_path,
                    )

-                    self._write(
-                        thumbnail,
-                        document.thumbnail_path,
-                    )
-
-                    if archive_path and Path(archive_path).is_file():
-                        generated_archive_filename = generate_unique_filename(
-                            document,
-                            archive_filename=True,
-                        )
+                    # After everything is in the database, copy the files into
+                    # place. If this fails, we'll also rollback the transaction.
+                    with FileLock(settings.MEDIA_LOCK):
+                        generated_filename = generate_unique_filename(document)
                        if (
-                            len(str(generated_archive_filename))
+                            len(str(generated_filename))
                            > Document.MAX_STORED_FILENAME_LENGTH
                        ):
                            self.log.warning(
-                                "Generated archive filename exceeds db path limit, falling back to default naming",
+                                "Generated source filename exceeds db path limit, falling back to default naming",
                            )
-                            generated_archive_filename = generate_filename(
+                            generated_filename = generate_filename(
                                document,
-                                archive_filename=True,
                                use_format=False,
                            )
-                        document.archive_filename = generated_archive_filename
-                        create_source_path_directory(document.archive_path)
+                        document.filename = generated_filename
+                        create_source_path_directory(document.source_path)
+
                        self._write(
-                            archive_path,
-                            document.archive_path,
+                            self.unmodified_original
+                            if self.unmodified_original is not None
+                            else self.working_copy,
+                            document.source_path,
                        )

-                        with Path(archive_path).open("rb") as f:
-                            document.archive_checksum = hashlib.md5(
-                                f.read(),
-                            ).hexdigest()
+                        self._write(
+                            thumbnail,
+                            document.thumbnail_path,
+                        )

-                # Don't save with the lock active. Saving will cause the file
-                # renaming logic to acquire the lock as well.
-                # This triggers things like file renaming
-                document.save()
+                        if archive_path and Path(archive_path).is_file():
+                            generated_archive_filename = generate_unique_filename(
+                                document,
+                                archive_filename=True,
+                            )
+                            if (
+                                len(str(generated_archive_filename))
+                                > Document.MAX_STORED_FILENAME_LENGTH
+                            ):
+                                self.log.warning(
+                                    "Generated archive filename exceeds db path limit, falling back to default naming",
+                                )
+                                generated_archive_filename = generate_filename(
+                                    document,
+                                    archive_filename=True,
+                                    use_format=False,
+                                )
+                            document.archive_filename = generated_archive_filename
+                            create_source_path_directory(document.archive_path)
+                            self._write(
+                                archive_path,
+                                document.archive_path,
+                            )

-                if document.root_document_id:
-                    document_updated.send(
-                        sender=self.__class__,
-                        document=document.root_document,
-                    )
+                            with Path(archive_path).open("rb") as f:
+                                document.archive_checksum = hashlib.md5(
+                                    f.read(),
+                                ).hexdigest()

-                # Delete the file only if it was successfully consumed
-                self.log.debug(f"Deleting original file {self.input_doc.original_file}")
-                self.input_doc.original_file.unlink()
-                self.log.debug(f"Deleting working copy {self.working_copy}")
-                self.working_copy.unlink()
-                if self.unmodified_original is not None:  # pragma: no cover
+                    # Don't save with the lock active. Saving will cause the file
+                    # renaming logic to acquire the lock as well.
+                    # This triggers things like file renaming
+                    document.save()
+
+                    if document.root_document_id:
+                        document_updated.send(
+                            sender=self.__class__,
+                            document=document.root_document,
+                        )
+
+                    # Delete the file only if it was successfully consumed
                    self.log.debug(
-                        f"Deleting unmodified original file {self.unmodified_original}",
+                        f"Deleting original file {self.input_doc.original_file}",
                    )
-                    self.unmodified_original.unlink()
+                    self.input_doc.original_file.unlink()
+                    self.log.debug(f"Deleting working copy {self.working_copy}")
+                    self.working_copy.unlink()
+                    if self.unmodified_original is not None:  # pragma: no cover
+                        self.log.debug(
+                            f"Deleting unmodified original file {self.unmodified_original}",
+                        )
+                        self.unmodified_original.unlink()

-                # https://github.com/jonaswinkler/paperless-ng/discussions/1037
-                shadow_file = (
-                    Path(self.input_doc.original_file).parent
-                    / f"._{Path(self.input_doc.original_file).name}"
+                    # https://github.com/jonaswinkler/paperless-ng/discussions/1037
+                    shadow_file = (
+                        Path(self.input_doc.original_file).parent
+                        / f"._{Path(self.input_doc.original_file).name}"
+                    )
+
+                    if Path(shadow_file).is_file():
+                        self.log.debug(f"Deleting shadow file {shadow_file}")
+                        Path(shadow_file).unlink()
+
+            except Exception as e:
+                self._fail(
+                    str(e),
+                    f"The following error occurred while storing document "
+                    f"{self.filename} after parsing: {e}",
+                    exc_info=True,
+                    exception=e,
                )
-
-                if Path(shadow_file).is_file():
-                    self.log.debug(f"Deleting shadow file {shadow_file}")
-                    Path(shadow_file).unlink()
-
-        except Exception as e:
-            self._fail(
-                str(e),
-                f"The following error occurred while storing document "
-                f"{self.filename} after parsing: {e}",
-                exc_info=True,
-                exception=e,
-            )
-        finally:
-            _parser_cleanup(document_parser)
-            tempdir.cleanup()
+            finally:
+                tempdir.cleanup()

        self.run_post_consume_script(document)

--- a/src/documents/management/commands/document_thumbnails.py
+++ b/src/documents/management/commands/document_thumbnails.py
@@ -3,14 +3,18 @@ import shutil

 from documents.management.commands.base import PaperlessCommand
 from documents.models import Document
-from documents.parsers import get_parser_class_for_mime_type
+from paperless.parsers.registry import get_parser_registry

 logger = logging.getLogger("paperless.management.thumbnails")


 def _process_document(doc_id: int) -> None:
    document: Document = Document.objects.get(id=doc_id)
-    parser_class = get_parser_class_for_mime_type(document.mime_type)
+    parser_class = get_parser_registry().get_parser_for_file(
+        document.mime_type,
+        document.original_filename or "",
+        document.source_path,
+    )

    if parser_class is None:
        logger.warning(
@@ -20,18 +24,9 @@ def _process_document(doc_id: int) -> None:
        )
        return

-    parser = parser_class(logging_group=None)
-
-    try:
-        thumb = parser.get_thumbnail(
-            document.source_path,
-            document.mime_type,
-            document.get_public_filename(),
-        )
+    with parser_class() as parser:
+        thumb = parser.get_thumbnail(document.source_path, document.mime_type)
        shutil.move(thumb, document.thumbnail_path)
-    finally:
-        # TODO(stumpylog): Cleanup once all parsers are handled
-        parser.cleanup()


 class Command(PaperlessCommand):
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -3,84 +3,47 @@ from __future__ import annotations
 import logging
 import mimetypes
 import os
-import re
 import shutil
 import subprocess
 import tempfile
-from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING

 from django.conf import settings

 from documents.loggers import LoggingMixin
-from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.parsers.registry import get_parser_registry

 if TYPE_CHECKING:
    import datetime

-# This regular expression will try to find dates in the document at
-# hand and will match the following formats:
-# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
-# - MONTH ZZZZ, with ZZZZ being 4 digits
-# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
-# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
-# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
-
-# TODO: isn't there a date parsing library for this?
-
-DATE_REGEX = re.compile(
-    r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
-    re.IGNORECASE,
-)
-
-
 logger = logging.getLogger("paperless.parsing")


-@lru_cache(maxsize=8)
 def is_mime_type_supported(mime_type: str) -> bool:
    """
    Returns True if the mime type is supported, False otherwise
    """
-    return get_parser_class_for_mime_type(mime_type) is not None
+    return get_parser_registry().get_parser_for_file(mime_type, "") is not None


-@lru_cache(maxsize=8)
 def get_default_file_extension(mime_type: str) -> str:
    """
    Returns the default file extension for a mimetype, or
    an empty string if it could not be determined
    """
-    for response in document_consumer_declaration.send(None):
-        parser_declaration = response[1]
-        supported_mime_types = parser_declaration["mime_types"]
-
-        if mime_type in supported_mime_types:
-            return supported_mime_types[mime_type]
+    parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
+    if parser_class is not None:
+        supported = parser_class.supported_mime_types()
+        if mime_type in supported:
+            return supported[mime_type]

    ext = mimetypes.guess_extension(mime_type)
-    if ext:
-        return ext
-    else:
-        return ""
+    return ext if ext else ""


-@lru_cache(maxsize=8)
 def is_file_ext_supported(ext: str) -> bool:
    """
    Returns True if the file extension is supported, False otherwise
@@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool:

 def get_supported_file_extensions() -> set[str]:
    extensions = set()
-    for response in document_consumer_declaration.send(None):
-        parser_declaration = response[1]
-        supported_mime_types = parser_declaration["mime_types"]
-
-        for mime_type in supported_mime_types:
+    for parser_class in get_parser_registry().all_parsers():
+        for mime_type, ext in parser_class.supported_mime_types().items():
            extensions.update(mimetypes.guess_all_extensions(mime_type))
            # Python's stdlib might be behind, so also add what the parser
            # says is the default extension
            # This makes image/webp supported on Python < 3.11
-            extensions.add(supported_mime_types[mime_type])
+            extensions.add(ext)

    return extensions


-def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
-    """
-    Returns the best parser (by weight) for the given mimetype or
-    None if no parser exists
-    """
-
-    options = []
-
-    for response in document_consumer_declaration.send(None):
-        parser_declaration = response[1]
-        supported_mime_types = parser_declaration["mime_types"]
-
-        if mime_type in supported_mime_types:
-            options.append(parser_declaration)
-
-    if not options:
-        return None
-
-    best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
-
-    # Return the parser with the highest weight.
-    return best_parser["parser"]
-
-
 def run_convert(
    input_file,
    output_file,
--- a/src/documents/signals/__init__.py
+++ b/src/documents/signals/__init__.py
@@ -2,5 +2,4 @@ from django.dispatch import Signal

 document_consumption_started = Signal()
 document_consumption_finished = Signal()
-document_consumer_declaration = Signal()
 document_updated = Signal()
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -52,8 +52,6 @@ from documents.models import StoragePath
 from documents.models import Tag
 from documents.models import WorkflowRun
 from documents.models import WorkflowTrigger
-from documents.parsers import DocumentParser
-from documents.parsers import get_parser_class_for_mime_type
 from documents.plugins.base import ConsumeTaskPlugin
 from documents.plugins.base import ProgressManager
 from documents.plugins.base import StopConsumeTaskError
@@ -65,6 +63,8 @@ from documents.signals.handlers import run_workflows
 from documents.signals.handlers import send_websocket_document_updated
 from documents.workflows.utils import get_workflows_for_trigger
 from paperless.config import AIConfig
+from paperless.parsers import ParserContext
+from paperless.parsers.registry import get_parser_registry
 from paperless_ai.indexing import llm_index_add_or_update_document
 from paperless_ai.indexing import llm_index_remove_document
 from paperless_ai.indexing import update_llm_index
@@ -304,7 +304,11 @@ def update_document_content_maybe_archive_file(document_id) -> None:

    mime_type = document.mime_type

-    parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+    parser_class = get_parser_registry().get_parser_for_file(
+        mime_type,
+        document.original_filename or "",
+        document.source_path,
+    )

    if not parser_class:
        logger.error(
@@ -313,98 +317,92 @@ def update_document_content_maybe_archive_file(document_id) -> None:
        )
        return

-    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
+    with parser_class() as parser:
+        parser.configure(ParserContext())

-    try:
-        parser.parse(document.source_path, mime_type, document.get_public_filename())
+        try:
+            parser.parse(document.source_path, mime_type)

-        thumbnail = parser.get_thumbnail(
-            document.source_path,
-            mime_type,
-            document.get_public_filename(),
-        )
+            thumbnail = parser.get_thumbnail(document.source_path, mime_type)

-        with transaction.atomic():
-            oldDocument = Document.objects.get(pk=document.pk)
-            if parser.get_archive_path():
-                with Path(parser.get_archive_path()).open("rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
-                # I'm going to save first so that in case the file move
-                # fails, the database is rolled back.
-                # We also don't use save() since that triggers the filehandling
-                # logic, and we don't want that yet (file not yet in place)
-                document.archive_filename = generate_unique_filename(
-                    document,
-                    archive_filename=True,
-                )
-                Document.objects.filter(pk=document.pk).update(
-                    archive_checksum=checksum,
-                    content=parser.get_text(),
-                    archive_filename=document.archive_filename,
-                )
-                newDocument = Document.objects.get(pk=document.pk)
-                if settings.AUDIT_LOG_ENABLED:
-                    LogEntry.objects.log_create(
-                        instance=oldDocument,
-                        changes={
-                            "content": [oldDocument.content, newDocument.content],
-                            "archive_checksum": [
-                                oldDocument.archive_checksum,
-                                newDocument.archive_checksum,
-                            ],
-                            "archive_filename": [
-                                oldDocument.archive_filename,
-                                newDocument.archive_filename,
-                            ],
-                        },
-                        additional_data={
-                            "reason": "Update document content",
-                        },
-                        action=LogEntry.Action.UPDATE,
-                    )
-            else:
-                Document.objects.filter(pk=document.pk).update(
-                    content=parser.get_text(),
-                )
-
-                if settings.AUDIT_LOG_ENABLED:
-                    LogEntry.objects.log_create(
-                        instance=oldDocument,
-                        changes={
-                            "content": [oldDocument.content, parser.get_text()],
-                        },
-                        additional_data={
-                            "reason": "Update document content",
-                        },
-                        action=LogEntry.Action.UPDATE,
-                    )
-
-            with FileLock(settings.MEDIA_LOCK):
+            with transaction.atomic():
+                oldDocument = Document.objects.get(pk=document.pk)
                if parser.get_archive_path():
-                    create_source_path_directory(document.archive_path)
-                    shutil.move(parser.get_archive_path(), document.archive_path)
-                shutil.move(thumbnail, document.thumbnail_path)
+                    with Path(parser.get_archive_path()).open("rb") as f:
+                        checksum = hashlib.md5(f.read()).hexdigest()
+                    # I'm going to save first so that in case the file move
+                    # fails, the database is rolled back.
+                    # We also don't use save() since that triggers the filehandling
+                    # logic, and we don't want that yet (file not yet in place)
+                    document.archive_filename = generate_unique_filename(
+                        document,
+                        archive_filename=True,
+                    )
+                    Document.objects.filter(pk=document.pk).update(
+                        archive_checksum=checksum,
+                        content=parser.get_text(),
+                        archive_filename=document.archive_filename,
+                    )
+                    newDocument = Document.objects.get(pk=document.pk)
+                    if settings.AUDIT_LOG_ENABLED:
+                        LogEntry.objects.log_create(
+                            instance=oldDocument,
+                            changes={
+                                "content": [oldDocument.content, newDocument.content],
+                                "archive_checksum": [
+                                    oldDocument.archive_checksum,
+                                    newDocument.archive_checksum,
+                                ],
+                                "archive_filename": [
+                                    oldDocument.archive_filename,
+                                    newDocument.archive_filename,
+                                ],
+                            },
+                            additional_data={
+                                "reason": "Update document content",
+                            },
+                            action=LogEntry.Action.UPDATE,
+                        )
+                else:
+                    Document.objects.filter(pk=document.pk).update(
+                        content=parser.get_text(),
+                    )

-        document.refresh_from_db()
-        logger.info(
-            f"Updating index for document {document_id} ({document.archive_checksum})",
-        )
-        with index.open_index_writer() as writer:
-            index.update_document(writer, document)
+                    if settings.AUDIT_LOG_ENABLED:
+                        LogEntry.objects.log_create(
+                            instance=oldDocument,
+                            changes={
+                                "content": [oldDocument.content, parser.get_text()],
+                            },
+                            additional_data={
+                                "reason": "Update document content",
+                            },
+                            action=LogEntry.Action.UPDATE,
+                        )

-        ai_config = AIConfig()
-        if ai_config.llm_index_enabled:
-            llm_index_add_or_update_document(document)
+                with FileLock(settings.MEDIA_LOCK):
+                    if parser.get_archive_path():
+                        create_source_path_directory(document.archive_path)
+                        shutil.move(parser.get_archive_path(), document.archive_path)
+                    shutil.move(thumbnail, document.thumbnail_path)

-        clear_document_caches(document.pk)
+            document.refresh_from_db()
+            logger.info(
+                f"Updating index for document {document_id} ({document.archive_checksum})",
+            )
+            with index.open_index_writer() as writer:
+                index.update_document(writer, document)

-    except Exception:
-        logger.exception(
-            f"Error while parsing document {document} (ID: {document_id})",
-        )
-    finally:
-        # TODO(stumpylog): Cleanup once all parsers are handled
-        parser.cleanup()
+            ai_config = AIConfig()
+            if ai_config.llm_index_enabled:
+                llm_index_add_or_update_document(document)
+
+            clear_document_caches(document.pk)
+
+        except Exception:
+            logger.exception(
+                f"Error while parsing document {document} (ID: {document_id})",
+            )


@shared_task
--- a/src/documents/tests/test_api_status.py
+++ b/src/documents/tests/test_api_status.py
@@ -101,13 +101,17 @@ class TestSystemStatus(APITestCase):
            - The response contains the correct install type
        """
        self.client.force_login(self.user)
-        os.environ["PNGX_CONTAINERIZED"] = "1"
-        response = self.client.get(self.ENDPOINT)
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        self.assertEqual(response.data["install_type"], "docker")
-        os.environ["KUBERNETES_SERVICE_HOST"] = "http://localhost"
-        response = self.client.get(self.ENDPOINT)
-        self.assertEqual(response.data["install_type"], "kubernetes")
+        with mock.patch.dict(os.environ, {"PNGX_CONTAINERIZED": "1"}, clear=False):
+            response = self.client.get(self.ENDPOINT)
+            self.assertEqual(response.status_code, status.HTTP_200_OK)
+            self.assertEqual(response.data["install_type"], "docker")
+        with mock.patch.dict(
+            os.environ,
+            {"PNGX_CONTAINERIZED": "1", "KUBERNETES_SERVICE_HOST": "http://localhost"},
+            clear=False,
+        ):
+            response = self.client.get(self.ENDPOINT)
+            self.assertEqual(response.data["install_type"], "kubernetes")

    @mock.patch("redis.Redis.execute_command")
    def test_system_status_redis_ping(self, mock_ping) -> None:
--- a/src/documents/tests/test_checks.py
+++ b/src/documents/tests/test_checks.py
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
    def test_parser_check(self) -> None:
        self.assertEqual(parser_check(None), [])

-        with mock.patch("documents.checks.document_consumer_declaration.send") as m:
-            m.return_value = []
+        with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
+            mock_registry = mock.MagicMock()
+            mock_registry.all_parsers.return_value = []
+            mock_registry_fn.return_value = mock_registry

            self.assertEqual(
                parser_check(None),
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -27,7 +27,6 @@ from documents.models import Document
 from documents.models import DocumentType
 from documents.models import StoragePath
 from documents.models import Tag
-from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.plugins.helpers import ProgressStatusOptions
 from documents.tasks import sanity_check
@@ -36,65 +35,106 @@ from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 from documents.tests.utils import GetConsumerMixin
 from paperless_mail.models import MailRule
-from paperless_mail.parsers import MailDocumentParser


-class _BaseTestParser(DocumentParser):
-    def get_settings(self) -> None:
-        """
-        This parser does not implement additional settings yet
-        """
+class _BaseNewStyleParser:
+    """Minimal ParserProtocol implementation for use in consumer tests."""
+
+    name: str = "test-parser"
+    version: str = "0.1"
+    author: str = "test"
+    url: str = "test"
+
+    @classmethod
+    def supported_mime_types(cls) -> dict:
+        return {
+            "application/pdf": ".pdf",
+            "image/png": ".png",
+            "message/rfc822": ".eml",
+        }
+
+    @classmethod
+    def score(cls, mime_type: str, filename: str, path=None):
+        return 0 if mime_type in cls.supported_mime_types() else None
+
+    @property
+    def can_produce_archive(self) -> bool:
+        return True
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        return False
+
+    def __init__(self) -> None:
+        self._tmpdir: Path | None = None
+        self._text: str | None = None
+        self._archive: Path | None = None
+        self._thumb: Path | None = None
+
+    def __enter__(self):
+        self._tmpdir = Path(
+            tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
+        )
+        _, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
+        self._thumb = Path(thumb)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._tmpdir and self._tmpdir.exists():
+            shutil.rmtree(self._tmpdir, ignore_errors=True)
+
+    def configure(self, context) -> None:
+        pass
+
+    def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
+        raise NotImplementedError
+
+    def get_text(self) -> str | None:
+        return self._text
+
+    def get_date(self):
        return None

+    def get_archive_path(self):
+        return self._archive

-class DummyParser(_BaseTestParser):
-    def __init__(self, logging_group, scratch_dir, archive_path) -> None:
-        super().__init__(logging_group, None)
-        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
-        self.archive_path = archive_path
+    def get_thumbnail(self, document_path, mime_type) -> Path:
+        return self._thumb

-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        return self.fake_thumb
+    def get_page_count(self, document_path, mime_type):
+        return None

-    def parse(self, document_path, mime_type, file_name=None) -> None:
-        self.text = "The Text"
+    def extract_metadata(self, document_path, mime_type) -> list:
+        return []


-class CopyParser(_BaseTestParser):
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        return self.fake_thumb
+class DummyParser(_BaseNewStyleParser):
+    _ARCHIVE_SRC = (
+        Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
+    )

-    def __init__(self, logging_group, progress_callback=None) -> None:
-        super().__init__(logging_group, progress_callback)
-        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
-
-    def parse(self, document_path, mime_type, file_name=None) -> None:
-        self.text = "The text"
-        self.archive_path = Path(self.tempdir / "archive.pdf")
-        shutil.copy(document_path, self.archive_path)
+    def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
+        self._text = "The Text"
+        if produce_archive and self._tmpdir:
+            self._archive = self._tmpdir / "archive.pdf"
+            shutil.copy(self._ARCHIVE_SRC, self._archive)


-class FaultyParser(_BaseTestParser):
-    def __init__(self, logging_group, scratch_dir) -> None:
-        super().__init__(logging_group)
-        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
+class CopyParser(_BaseNewStyleParser):
+    def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
+        self._text = "The text"
+        if produce_archive and self._tmpdir:
+            self._archive = self._tmpdir / "archive.pdf"
+            shutil.copy(document_path, self._archive)

-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        return self.fake_thumb

-    def parse(self, document_path, mime_type, file_name=None):
+class FaultyParser(_BaseNewStyleParser):
+    def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
        raise ParseError("Does not compute.")


-class FaultyGenericExceptionParser(_BaseTestParser):
-    def __init__(self, logging_group, scratch_dir) -> None:
-        super().__init__(logging_group)
-        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
-
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        return self.fake_thumb
-
-    def parse(self, document_path, mime_type, file_name=None):
+class FaultyGenericExceptionParser(_BaseNewStyleParser):
+    def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
        raise Exception("Generic exception.")


@@ -148,38 +188,12 @@ class TestConsumer(
        self.assertEqual(payload["data"]["max_progress"], last_progress_max)
        self.assertEqual(payload["data"]["status"], last_status)

-    def make_dummy_parser(self, logging_group, progress_callback=None):
-        return DummyParser(
-            logging_group,
-            self.dirs.scratch_dir,
-            self.get_test_archive_file(),
-        )
-
-    def make_faulty_parser(self, logging_group, progress_callback=None):
-        return FaultyParser(logging_group, self.dirs.scratch_dir)
-
-    def make_faulty_generic_exception_parser(
-        self,
-        logging_group,
-        progress_callback=None,
-    ):
-        return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
-
    def setUp(self) -> None:
        super().setUp()

-        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
-        m = patcher.start()
-        m.return_value = [
-            (
-                None,
-                {
-                    "parser": self.make_dummy_parser,
-                    "mime_types": {"application/pdf": ".pdf"},
-                    "weight": 0,
-                },
-            ),
-        ]
+        patcher = mock.patch("documents.consumer.get_parser_registry")
+        mock_registry = patcher.start()
+        mock_registry.return_value.get_parser_for_file.return_value = DummyParser
        self.addCleanup(patcher.stop)

    def get_test_file(self):
@@ -548,9 +562,9 @@ class TestConsumer(
            ) as consumer:
                consumer.run()

-    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    @mock.patch("documents.consumer.get_parser_registry")
    def testNoParsers(self, m) -> None:
-        m.return_value = []
+        m.return_value.get_parser_for_file.return_value = None

        with self.assertRaisesMessage(
            ConsumerError,
@@ -561,18 +575,9 @@ class TestConsumer(

        self._assert_first_last_send_progress(last_status="FAILED")

-    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    @mock.patch("documents.consumer.get_parser_registry")
    def testFaultyParser(self, m) -> None:
-        m.return_value = [
-            (
-                None,
-                {
-                    "parser": self.make_faulty_parser,
-                    "mime_types": {"application/pdf": ".pdf"},
-                    "weight": 0,
-                },
-            ),
-        ]
+        m.return_value.get_parser_for_file.return_value = FaultyParser

        with self.get_consumer(self.get_test_file()) as consumer:
            with self.assertRaisesMessage(
@@ -583,18 +588,9 @@ class TestConsumer(

        self._assert_first_last_send_progress(last_status="FAILED")

-    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    @mock.patch("documents.consumer.get_parser_registry")
    def testGenericParserException(self, m) -> None:
-        m.return_value = [
-            (
-                None,
-                {
-                    "parser": self.make_faulty_generic_exception_parser,
-                    "mime_types": {"application/pdf": ".pdf"},
-                    "weight": 0,
-                },
-            ),
-        ]
+        m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser

        with self.get_consumer(self.get_test_file()) as consumer:
            with self.assertRaisesMessage(
@@ -1018,7 +1014,7 @@ class TestConsumer(
        self._assert_first_last_send_progress()

    @override_settings(FILENAME_FORMAT="{title}")
-    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    @mock.patch("documents.consumer.get_parser_registry")
    def test_similar_filenames(self, m) -> None:
        shutil.copy(
            Path(__file__).parent / "samples" / "simple.pdf",
@@ -1032,16 +1028,7 @@ class TestConsumer(
            Path(__file__).parent / "samples" / "simple-noalpha.png",
            settings.CONSUMPTION_DIR / "simple.png.pdf",
        )
-        m.return_value = [
-            (
-                None,
-                {
-                    "parser": CopyParser,
-                    "mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
-                    "weight": 0,
-                },
-            ),
-        ]
+        m.return_value.get_parser_for_file.return_value = CopyParser

        with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
            consumer.run()
@@ -1069,8 +1056,10 @@ class TestConsumer(

        sanity_check()

+    @mock.patch("documents.consumer.get_parser_registry")
    @mock.patch("documents.consumer.run_subprocess")
-    def test_try_to_clean_invalid_pdf(self, m) -> None:
+    def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None:
+        mock_registry.return_value.get_parser_for_file.return_value = None
        shutil.copy(
            Path(__file__).parent / "samples" / "invalid_pdf.pdf",
            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
@@ -1091,11 +1080,11 @@ class TestConsumer(
            self.assertEqual(command[1], "--replace-input")

    @mock.patch("paperless_mail.models.MailRule.objects.get")
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
-    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    @mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
+    @mock.patch("documents.consumer.get_parser_registry")
    def test_mail_parser_receives_mailrule(
        self,
-        mock_consumer_declaration_send: mock.Mock,
+        mock_get_parser_registry: mock.Mock,
        mock_mail_parser_parse: mock.Mock,
        mock_mailrule_get: mock.Mock,
    ) -> None:
@@ -1107,25 +1096,21 @@ class TestConsumer(
        THEN:
            - The mail parser should receive the mail rule
        """
-        mock_consumer_declaration_send.return_value = [
-            (
-                None,
-                {
-                    "parser": MailDocumentParser,
-                    "mime_types": {"message/rfc822": ".eml"},
-                    "weight": 0,
-                },
-            ),
-        ]
+        from paperless.parsers.mail import MailDocumentParser
+
+        mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
+            MailDocumentParser
+        )
        mock_mailrule_get.return_value = mock.Mock(
            pdf_layout=MailRule.PdfLayout.HTML_ONLY,
        )
        with self.get_consumer(
            filepath=(
                Path(__file__).parent.parent.parent
-                / Path("paperless_mail")
+                / Path("paperless")
                / Path("tests")
                / Path("samples")
+                / Path("mail")
            ).resolve()
            / "html.eml",
            source=DocumentSource.MailFetch,
@@ -1136,12 +1121,10 @@ class TestConsumer(
                ConsumerError,
            ):
                consumer.run()
-                mock_mail_parser_parse.assert_called_once_with(
-                    consumer.working_copy,
-                    "message/rfc822",
-                    file_name="sample.pdf",
-                    mailrule=mock_mailrule_get.return_value,
-                )
+            mock_mail_parser_parse.assert_called_once_with(
+                consumer.working_copy,
+                "message/rfc822",
+            )


@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,130 +1,14 @@
-from tempfile import TemporaryDirectory
-from unittest import mock
-
-from django.apps import apps
 from django.test import TestCase
 from django.test import override_settings

 from documents.parsers import get_default_file_extension
-from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
+from paperless.parsers.registry import get_parser_registry
+from paperless.parsers.registry import reset_parser_registry
+from paperless.parsers.tesseract import RasterisedDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
-from paperless_tesseract.parsers import RasterisedDocumentParser
-
-
-class TestParserDiscovery(TestCase):
-    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test_get_parser_class_1_parser(self, m, *args) -> None:
-        """
-        GIVEN:
-            - Parser declared for a given mimetype
-        WHEN:
-            - Attempt to get parser for the mimetype
-        THEN:
-            - Declared parser class is returned
-        """
-
-        class DummyParser:
-            pass
-
-        m.return_value = (
-            (
-                None,
-                {
-                    "weight": 0,
-                    "parser": DummyParser,
-                    "mime_types": {"application/pdf": ".pdf"},
-                },
-            ),
-        )
-
-        self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
-
-    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test_get_parser_class_n_parsers(self, m, *args) -> None:
-        """
-        GIVEN:
-            - Two parsers declared for a given mimetype
-            - Second parser has a higher weight
-        WHEN:
-            - Attempt to get parser for the mimetype
-        THEN:
-            - Second parser class is returned
-        """
-
-        class DummyParser1:
-            pass
-
-        class DummyParser2:
-            pass
-
-        m.return_value = (
-            (
-                None,
-                {
-                    "weight": 0,
-                    "parser": DummyParser1,
-                    "mime_types": {"application/pdf": ".pdf"},
-                },
-            ),
-            (
-                None,
-                {
-                    "weight": 1,
-                    "parser": DummyParser2,
-                    "mime_types": {"application/pdf": ".pdf"},
-                },
-            ),
-        )
-
-        self.assertEqual(
-            get_parser_class_for_mime_type("application/pdf"),
-            DummyParser2,
-        )
-
-    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test_get_parser_class_0_parsers(self, m, *args) -> None:
-        """
-        GIVEN:
-            - No parsers are declared
-        WHEN:
-            - Attempt to get parser for the mimetype
-        THEN:
-            - No parser class is returned
-        """
-        m.return_value = []
-        with TemporaryDirectory():
-            self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
-
-    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
-        """
-        GIVEN:
-            - No parser declared for a given mimetype
-            - Parser declared for a different mimetype
-        WHEN:
-            - Attempt to get parser for the given mimetype
-        THEN:
-            - No parser class is returned
-        """
-
-        class DummyParser:
-            pass
-
-        m.return_value = (
-            (
-                None,
-                {
-                    "weight": 0,
-                    "parser": DummyParser,
-                    "mime_types": {"application/pdf": ".pdf"},
-                },
-            ),
-        )
-
-        self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))


 class TestParserAvailability(TestCase):
@@ -151,7 +35,7 @@ class TestParserAvailability(TestCase):
            self.assertIn(ext, supported_exts)
            self.assertEqual(get_default_file_extension(mime_type), ext)
            self.assertIsInstance(
-                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                get_parser_registry().get_parser_for_file(mime_type, "")(),
                RasterisedDocumentParser,
            )

@@ -175,7 +59,7 @@ class TestParserAvailability(TestCase):
            self.assertIn(ext, supported_exts)
            self.assertEqual(get_default_file_extension(mime_type), ext)
            self.assertIsInstance(
-                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                get_parser_registry().get_parser_for_file(mime_type, "")(),
                TextDocumentParser,
            )

@@ -198,22 +82,23 @@ class TestParserAvailability(TestCase):
            ),
        ]

-        # Force the app ready to notice the settings override
-        with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
-            app = apps.get_app_config("paperless_tika")
-            app.ready()
+        self.addCleanup(reset_parser_registry)
+
+        # Reset and rebuild the registry with Tika enabled.
+        with override_settings(TIKA_ENABLED=True):
+            reset_parser_registry()
            supported_exts = get_supported_file_extensions()

-        for mime_type, ext in supported_mimes_and_exts:
-            self.assertIn(ext, supported_exts)
-            self.assertEqual(get_default_file_extension(mime_type), ext)
-            self.assertIsInstance(
-                get_parser_class_for_mime_type(mime_type)(logging_group=None),
-                TikaDocumentParser,
-            )
+            for mime_type, ext in supported_mimes_and_exts:
+                self.assertIn(ext, supported_exts)
+                self.assertEqual(get_default_file_extension(mime_type), ext)
+                self.assertIsInstance(
+                    get_parser_registry().get_parser_for_file(mime_type, "")(),
+                    TikaDocumentParser,
+                )

    def test_no_parser_for_mime(self) -> None:
-        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
+        self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", ""))

    def test_default_extension(self) -> None:
        # Test no parser declared still returns a an extension
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -7,7 +7,6 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
-from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -158,7 +157,6 @@ from documents.models import UiSettings
 from documents.models import Workflow
 from documents.models import WorkflowAction
 from documents.models import WorkflowTrigger
-from documents.parsers import get_parser_class_for_mime_type
 from documents.permissions import AcknowledgeTasksPermissions
 from documents.permissions import PaperlessAdminPermissions
 from documents.permissions import PaperlessNotePermissions
@@ -226,7 +224,7 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
-from paperless.parsers import ParserProtocol
+from paperless.parsers.registry import get_parser_registry
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1083,17 +1081,17 @@ class DocumentViewSet(
        if not Path(file).is_file():
            return None

-        parser_class = get_parser_class_for_mime_type(mime_type)
+        parser_class = get_parser_registry().get_parser_for_file(
+            mime_type,
+            Path(file).name,
+            Path(file),
+        )
        if parser_class:
-            parser = parser_class(progress_callback=None, logging_group=None)
-            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
-
            try:
-                with cm:
+                with parser_class() as parser:
                    return parser.extract_metadata(file, mime_type)
            except Exception:  # pragma: no cover
                logger.exception(f"Issue getting metadata for {file}")
-                # TODO: cover GPG errors, remove later.
                return []
        else:  # pragma: no cover
            logger.warning(f"No parser for {mime_type}")
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -3,6 +3,7 @@ import os
 import pwd
 import shutil
 import stat
+import subprocess
 from pathlib import Path

 from django.conf import settings
@@ -299,3 +300,62 @@ def check_deprecated_db_settings(
        )

    return warnings
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
+    if settings.REMOTE_OCR_ENGINE == "azureai" and not (
+        settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
+    ):
+        return [
+            Error(
+                "Azure AI remote parser requires endpoint and API key to be configured.",
+            ),
+        ]
+
+    return []
+
+
+def get_tesseract_langs() -> list[str]:
+    """Return the languages reported by ``tesseract --list-langs``."""
+    tesseract = shutil.which("tesseract")
+    if tesseract is None:
+        # No binary on PATH; running ``[None, ...]`` would raise TypeError.
+        return []
+    proc = subprocess.run([tesseract, "--list-langs"], capture_output=True)
+    # The first line of the output is a header, not a language; skip it.
+    lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
+    return [x.strip() for x in lines]
+
+
+@register()
+def check_default_language_available(app_configs, **kwargs):
+    errs = []
+
+    if not settings.OCR_LANGUAGE:
+        errs.append(
+            Warning(
+                "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
+                "This means that tesseract will fallback to english.",
+            ),
+        )
+        return errs
+
+    # binaries_check in paperless will check and report if this doesn't exist
+    # So skip trying to do anything here and let that handle missing binaries
+    if shutil.which("tesseract") is not None:
+        installed_langs = get_tesseract_langs()
+
+        specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
+
+        for lang in specified_langs:
+            if lang not in installed_langs:
+                errs.append(
+                    Error(
+                        f"The selected ocr language {lang} is "
+                        f"not installed. Paperless cannot OCR your documents "
+                        f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
+                    ),
+                )
+
+    return errs
--- a/src/paperless/parsers/init.py
+++ b/src/paperless/parsers/init.py
@@ -35,6 +35,7 @@ Usage example (third-party parser)::

 from __future__ import annotations

+from dataclasses import dataclass
 from typing import TYPE_CHECKING
 from typing import Protocol
 from typing import Self
@@ -48,6 +49,7 @@ if TYPE_CHECKING:

 __all__ = [
    "MetadataEntry",
+    "ParserContext",
    "ParserProtocol",
 ]

@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
    """String representation of the field value."""


+@dataclass(frozen=True, slots=True)
+class ParserContext:
+    """Immutable context passed to a parser before parse().
+
+    The consumer assembles this from the ingestion event and Django
+    settings, then calls ``parser.configure(context)`` before
+    ``parser.parse()``.  Parsers read only the fields relevant to them;
+    unneeded fields are ignored.
+
+    ``frozen=True`` prevents accidental mutation after the consumer
+    hands the context off.  ``slots=True`` keeps instances lightweight.
+
+    Fields
+    ------
+    mailrule_id : int | None
+        Primary key of the ``MailRule`` that triggered this ingestion,
+        or ``None`` when the document did not arrive via a mail rule.
+        Used by ``MailDocumentParser`` to select the PDF layout.
+
+    Notes
+    -----
+    Future fields (not yet implemented):
+
+    * ``output_type`` — PDF/A variant for archive generation
+      (replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
+    * ``ocr_mode`` — skip-text, redo, force, etc.
+      (replaces ``settings.OCR_MODE`` reads inside parsers).
+    * ``ocr_language`` — Tesseract language string.
+      (replaces ``settings.OCR_LANGUAGE`` reads inside parsers).
+
+    When those fields are added the consumer will read from Django
+    settings once and populate them here, decoupling parsers from
+    ``settings.*`` entirely.
+    """
+
+    mailrule_id: int | None = None
+
+
@runtime_checkable
 class ParserProtocol(Protocol):
    """Structural contract for all Paperless-ngx document parsers.
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        """Apply source context before parse().
+
+        Called by the consumer after instantiation and before parse().
+        The protocol body is a no-op; implementing parsers read only the
+        context fields they need and ignore the rest.
+
+        Parameters
+        ----------
+        context:
+            Immutable context assembled by the consumer for this
+            specific ingestion event.
+        """
+        ...
+
    def parse(
        self,
        document_path: Path,
--- a/src/paperless/parsers/mail.py
+++ b/src/paperless/parsers/mail.py
@@ -0,0 +1,834 @@
+"""
+Built-in mail document parser.
+
+Handles message/rfc822 (EML) MIME type by:
+- Parsing the email using imap_tools
+- Generating a PDF via Gotenberg (for display and archive)
+- Extracting text via Tika for HTML content
+- Extracting metadata from email headers
+
+The parser always produces a PDF because EML files cannot be rendered
+natively in a browser (requires_pdf_rendition=True).
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import shutil
+import tempfile
+from html import escape
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
+
+from bleach import clean
+from bleach import linkify
+from django.conf import settings
+from django.utils import timezone
+from django.utils.timezone import is_naive
+from django.utils.timezone import make_aware
+from gotenberg_client import GotenbergClient
+from gotenberg_client.constants import A4
+from gotenberg_client.options import Measurement
+from gotenberg_client.options import MeasurementUnitType
+from gotenberg_client.options import PageMarginsType
+from gotenberg_client.options import PdfAFormat
+from humanize import naturalsize
+from imap_tools import MailAttachment
+from imap_tools import MailMessage
+from tika_client import TikaClient
+
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless.models import OutputTypeChoices
+from paperless.version import __full_version_str__
+from paperless_mail.models import MailRule
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext
+
+logger = logging.getLogger("paperless.parsing.mail")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "message/rfc822": ".eml",
+}
+
+
+class MailDocumentParser:
+    """Parse .eml email files for Paperless-ngx.
+
+    Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
+    and sends the HTML part to a Tika server for text extraction.  Because
+    EML files cannot be rendered natively in a browser, the parser always
+    produces a PDF rendition (requires_pdf_rendition=True).
+
+    Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
+    apply mail-rule-specific PDF layout options:
+
+        parser.configure(ParserContext(mailrule_id=rule.pk))
+        parser.parse(path, mime_type)
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
+    """
+
+    name: str = "Paperless-ngx Mail Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            10 if the MIME type is supported, otherwise None.
+        """
+        if mime_type in _SUPPORTED_MIME_TYPES:
+            return 10
+        return None
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — the mail parser produces a display PDF
+            (requires_pdf_rendition=True), not an optional OCR archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always True — EML files cannot be rendered natively in a browser,
+            so a PDF conversion is always required for display.
+        """
+        return True
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
+        self._text: str | None = None
+        self._date: datetime.datetime | None = None
+        self._archive_path: Path | None = None
+        self._mailrule_id: int | None = None
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        logger.debug("Cleaning up temporary directory %s", self._tempdir)
+        shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def configure(self, context: ParserContext) -> None:
+        self._mailrule_id = context.mailrule_id
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Parse the given .eml into formatted text and a PDF archive.
+
+        Call ``configure(ParserContext(mailrule_id=...))`` before this method
+        to apply mail-rule-specific PDF layout options.  The ``produce_archive``
+        flag is accepted for protocol compatibility but is ignored —
+        the mail parser always produces a PDF since EML files cannot be
+        displayed natively.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the .eml file.
+        mime_type:
+            Detected MIME type of the document (should be "message/rfc822").
+        produce_archive:
+            Accepted for protocol compatibility. The PDF rendition is always
+            produced since EML files cannot be displayed natively in a browser.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be parsed or PDF generation fails.
+        """
+
+        def strip_text(text: str) -> str:
+            """Reduces the spacing of the given text string."""
+            text = re.sub(r"\s+", " ", text)
+            text = re.sub(r"(\n *)+", "\n", text)
+            return text.strip()
+
+        def build_formatted_text(mail_message: MailMessage) -> str:
+            """Constructs a formatted string based on the given email."""
+            # Use only the ``mail_message`` argument, not the enclosing ``mail``.
+            fmt_text = f"Subject: {mail_message.subject}\n\n"
+            fmt_text += f"From: {mail_message.from_values.full if mail_message.from_values else ''}\n\n"
+            to_list = [address.full for address in mail_message.to_values]
+            fmt_text += f"To: {', '.join(to_list)}\n\n"
+            if mail_message.cc_values:
+                cc_list = [address.full for address in mail_message.cc_values]
+                fmt_text += f"CC: {', '.join(cc_list)}\n\n"
+            if mail_message.bcc_values:
+                bcc_list = [address.full for address in mail_message.bcc_values]
+                fmt_text += f"BCC: {', '.join(bcc_list)}\n\n"
+            if mail_message.attachments:
+                att = []
+                for a in mail_message.attachments:
+                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
+                    att.append(
+                        f"{a.filename} ({attachment_size})",
+                    )
+                fmt_text += f"Attachments: {', '.join(att)}\n\n"
+
+            if mail_message.html:
+                html_text = strip_text(self.tika_parse(mail_message.html))
+                fmt_text += "HTML content: " + html_text
+
+            fmt_text += f"\n\n{strip_text(mail_message.text)}"
+
+            return fmt_text
+
+        logger.debug("Parsing file %s into an email", document_path.name)
+        mail = self.parse_file_to_message(document_path)
+
+        logger.debug("Building formatted text from email")
+        self._text = build_formatted_text(mail)
+
+        if is_naive(mail.date):
+            self._date = make_aware(mail.date)
+        else:
+            self._date = mail.date
+
+        logger.debug("Creating a PDF from the email")
+        if self._mailrule_id:
+            rule = MailRule.objects.get(pk=self._mailrule_id)
+            self._archive_path = self.generate_pdf(
+                mail,
+                MailRule.PdfLayout(rule.pdf_layout),
+            )
+        else:
+            self._archive_path = self.generate_pdf(mail)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not been called yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Date from the email headers, or None if not detected.
+        """
+        return self._date
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated archive PDF, or None.
+
+        Returns
+        -------
+        Path | None
+            Path to the PDF produced by Gotenberg, or None if parse has not
+            been called yet.
+        """
+        return self._archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(
+        self,
+        document_path: Path,
+        mime_type: str,
+        file_name: str | None = None,
+    ) -> Path:
+        """Generate a thumbnail from the PDF rendition of the email.
+
+        Converts the document to PDF first if not already done.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+        file_name:
+            Kept for backward compatibility; not used.
+
+        Returns
+        -------
+        Path
+            Path to the generated WebP thumbnail inside the temporary directory.
+        """
+        if not self._archive_path:
+            self._archive_path = self.generate_pdf(
+                self.parse_file_to_message(document_path),
+            )
+
+        return make_thumbnail_from_pdf(
+            self._archive_path,
+            self._tempdir,
+        )
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.
+
+        Counts pages in the archive PDF produced by a preceding parse()
+        call.  Returns ``None`` if parse() has not been called yet or if
+        no archive was produced.
+
+        Returns
+        -------
+        int | None
+            Page count of the archive PDF, or ``None``.
+        """
+        if self._archive_path is not None:
+            from paperless.parsers.utils import get_page_count_for_pdf
+
+            return get_page_count_for_pdf(self._archive_path, log=logger)
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract metadata from the email headers.
+
+        Returns email headers as metadata entries with prefix "header",
+        plus summary entries for attachments and date.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Sorted list of metadata entries, or ``[]`` on parse failure.
+        """
+        result: list[MetadataEntry] = []
+
+        try:
+            mail = self.parse_file_to_message(document_path)
+        except ParseError as e:
+            logger.warning(
+                "Error while fetching document metadata for %s: %s",
+                document_path,
+                e,
+            )
+            return result
+
+        for key, header_values in mail.headers.items():
+            value = ", ".join(header_values)
+            try:
+                value.encode("utf-8")
+            except UnicodeEncodeError as e:  # pragma: no cover
+                logger.debug("Skipping header %s: %s", key, e)
+                continue
+
+            result.append(
+                {
+                    "namespace": "",
+                    "prefix": "header",
+                    "key": key,
+                    "value": value,
+                },
+            )
+
+        result.append(
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": "attachments",
+                "value": ", ".join(
+                    f"{attachment.filename}"
+                    f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
+                    for attachment in mail.attachments
+                ),
+            },
+        )
+
+        result.append(
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": "date",
+                "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
+            },
+        )
+
+        result.sort(key=lambda item: (item["prefix"], item["key"]))
+        return result
+
+    # ------------------------------------------------------------------
+    # Email-specific methods
+    # ------------------------------------------------------------------
+
+    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
+        """Convert the OCR output type setting to a Gotenberg PdfAFormat."""
+        if settings.OCR_OUTPUT_TYPE in {
+            OutputTypeChoices.PDF_A,
+            OutputTypeChoices.PDF_A2,
+        }:
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
+            logger.warning(
+                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+            )
+            return PdfAFormat.A2b
+        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
+            return PdfAFormat.A3b
+        return None
+
+    @staticmethod
+    def parse_file_to_message(filepath: Path) -> MailMessage:
+        """Parse the given .eml file into a MailMessage object.
+
+        Parameters
+        ----------
+        filepath:
+            Path to the .eml file.
+
+        Returns
+        -------
+        MailMessage
+            Parsed mail message.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be parsed or is missing required fields.
+        """
+        try:
+            parsed = MailMessage.from_bytes(filepath.read_bytes())
+            has_sender = parsed.from_values is not None
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {filepath}: {err}",
+            ) from err
+
+        if not has_sender:
+            # Raised outside the try so it is not wrapped a second time.
+            raise ParseError(f"Could not parse {filepath}: Missing 'from'")
+
+        return parsed
+
+    def tika_parse(self, html: str) -> str:
+        """Send HTML content to the Tika server for text extraction.
+
+        Parameters
+        ----------
+        html:
+            HTML string to parse.
+
+        Returns
+        -------
+        str
+            Extracted plain text.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the Tika server cannot be reached or returns an error.
+        """
+        logger.info("Sending content to Tika server")
+
+        try:
+            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
+                parsed = client.tika.as_text.from_buffer(html, "text/html")
+
+                if parsed.content is not None:
+                    return parsed.content.strip()
+                return ""
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse content with tika server at "
+                f"{settings.TIKA_ENDPOINT}: {err}",
+            ) from err
+
+    def generate_pdf(
+        self,
+        mail_message: MailMessage,
+        pdf_layout: MailRule.PdfLayout | None = None,
+    ) -> Path:
+        """Generate a PDF from the email message.
+
+        Creates separate PDFs for the email body and HTML content, then
+        merges them according to the requested layout.
+
+        Parameters
+        ----------
+        mail_message:
+            Parsed email message.
+        pdf_layout:
+            Layout option for the PDF. Falls back to the
+            EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+        """
+        archive_path = Path(self._tempdir) / "merged.pdf"
+
+        mail_pdf_file = self.generate_pdf_from_mail(mail_message)
+
+        if pdf_layout is None:
+            pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)
+
+        # If no HTML content, create the PDF from the message.
+        # Otherwise, create 2 PDFs and merge them with Gotenberg.
+        if not mail_message.html:
+            archive_path.write_bytes(mail_pdf_file.read_bytes())
+        else:
+            pdf_of_html_content = self.generate_pdf_from_html(
+                mail_message.html,
+                mail_message.attachments,
+            )
+
+            logger.debug("Merging email text and HTML content into single PDF")
+
+            with (
+                GotenbergClient(
+                    host=settings.TIKA_GOTENBERG_ENDPOINT,
+                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+                ) as client,
+                client.merge.merge() as route,
+            ):
+                # Configure requested PDF/A formatting, if any
+                pdf_a_format = self._settings_to_gotenberg_pdfa()
+                if pdf_a_format is not None:
+                    route.pdf_format(pdf_a_format)
+
+                match pdf_layout:
+                    case MailRule.PdfLayout.HTML_TEXT:
+                        route.merge([pdf_of_html_content, mail_pdf_file])
+                    case MailRule.PdfLayout.HTML_ONLY:
+                        route.merge([pdf_of_html_content])
+                    case MailRule.PdfLayout.TEXT_ONLY:
+                        route.merge([mail_pdf_file])
+                    case MailRule.PdfLayout.TEXT_HTML | _:
+                        route.merge([mail_pdf_file, pdf_of_html_content])
+
+                try:
+                    response = route.run()
+                    archive_path.write_bytes(response.content)
+                except Exception as err:
+                    raise ParseError(
+                        f"Error while merging email HTML into PDF: {err}",
+                    ) from err
+
+        return archive_path
+
+    def mail_to_html(self, mail: MailMessage) -> Path:
+        """Convert the given email into an HTML file using a template.
+
+        Parameters
+        ----------
+        mail:
+            Parsed mail message.
+
+        Returns
+        -------
+        Path
+            Path to the rendered HTML file inside the temporary directory.
+        """
+
+        def clean_html(text: str) -> str:
+            """Attempt to clean, escape, and linkify the given HTML string."""
+            if isinstance(text, list):
+                text = "\n".join([str(e) for e in text])
+            if not isinstance(text, str):
+                text = str(text)
+            text = escape(text)
+            text = clean(text)
+            text = linkify(text, parse_email=True)
+            text = text.replace("\n", "<br>")
+            return text
+
+        data = {}
+
+        data["subject"] = clean_html(mail.subject)
+        if data["subject"]:
+            data["subject_label"] = "Subject"
+        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
+        if data["from"]:
+            data["from_label"] = "From"
+        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        if data["to"]:
+            data["to_label"] = "To"
+        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        if data["cc"]:
+            data["cc_label"] = "CC"
+        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        if data["bcc"]:
+            data["bcc_label"] = "BCC"
+
+        att = []
+        for a in mail.attachments:
+            att.append(
+                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
+            )
+        data["attachments"] = clean_html(", ".join(att))
+        if data["attachments"]:
+            data["attachments_label"] = "Attachments"
+
+        data["date"] = clean_html(
+            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
+        )
+        data["content"] = clean_html(mail.text.strip())
+
+        from django.template.loader import render_to_string
+
+        html_file = Path(self._tempdir) / "email_as_html.html"
+        html_file.write_text(render_to_string("email_msg_template.html", context=data))
+
+        return html_file
+
+    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
+        """Create a PDF from the email body using an HTML template and Gotenberg.
+
+        Parameters
+        ----------
+        mail:
+            Parsed mail message.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Gotenberg returns an error.
+        """
+        logger.info("Converting mail to PDF")
+
+        css_file = (
+            Path(__file__).parent.parent.parent
+            / "paperless_mail"
+            / "templates"
+            / "output.css"
+        )
+        email_html_file = self.mail_to_html(mail)
+
+        with (
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ) as client,
+            client.chromium.html_to_pdf() as route,
+        ):
+            # Configure requested PDF/A formatting, if any
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
+            if pdf_a_format is not None:
+                route.pdf_format(pdf_a_format)
+
+            try:
+                response = (
+                    route.index(email_html_file)
+                    .resource(css_file)
+                    .margins(
+                        PageMarginsType(
+                            top=Measurement(0.1, MeasurementUnitType.Inches),
+                            bottom=Measurement(0.1, MeasurementUnitType.Inches),
+                            left=Measurement(0.1, MeasurementUnitType.Inches),
+                            right=Measurement(0.1, MeasurementUnitType.Inches),
+                        ),
+                    )
+                    .size(A4)
+                    .scale(1.0)
+                    .run()
+                )
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting email to PDF: {err}",
+                ) from err
+
+        email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
+        email_as_pdf_file.write_bytes(response.content)
+
+        return email_as_pdf_file
+
+    def generate_pdf_from_html(
+        self,
+        orig_html: str,
+        attachments: list[MailAttachment],
+    ) -> Path:
+        """Generate a PDF from the HTML content of the email.
+
+        Parameters
+        ----------
+        orig_html:
+            Raw HTML string from the email body.
+        attachments:
+            Email attachments; those with a Content-ID become inline resources.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Gotenberg returns an error.
+        """
+
+        def clean_html_script(text: str) -> str:
+            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
+            text = compiled_open.sub("<div hidden ", text)
+
+            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
+            text = compiled_close.sub("</div", text)
+            return text
+
+        logger.info("Converting message html to PDF")
+
+        tempdir = Path(self._tempdir)
+
+        html_clean = clean_html_script(orig_html)
+
+        with (
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ) as client,
+            client.chromium.html_to_pdf() as route,
+        ):
+            # Configure requested PDF/A formatting, if any
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
+            if pdf_a_format is not None:
+                route.pdf_format(pdf_a_format)
+
+            # Add attachments as resources, cleaning the filename and replacing
+            # it in the index file for inclusion
+            for attachment in attachments:
+                # Skip attachments without a Content-ID: they cannot be inlined, and
+                # replacing a bare "cid:" below would corrupt the other references
+                if not attachment.content_id:
+                    continue
+
+                # Clean the attachment name to be valid
+                name_cid = f"cid:{attachment.content_id}"
+                name_clean = "".join(e for e in name_cid if e.isalnum())
+
+                # Write attachment payload to a temp file and register it as a resource
+                temp_file = tempdir / name_clean
+                temp_file.write_bytes(attachment.payload)
+                route.resource(temp_file)
+
+                # Replace as needed the name with the clean name
+                html_clean = html_clean.replace(name_cid, name_clean)
+
+            # Store the cleaned-up HTML; this is our index file, the main page
+            html_clean_file = tempdir / "index.html"
+            html_clean_file.write_text(html_clean)
+            route.index(html_clean_file)
+
+            # Set page size, margins
+            route.margins(
+                PageMarginsType(
+                    top=Measurement(0.1, MeasurementUnitType.Inches),
+                    bottom=Measurement(0.1, MeasurementUnitType.Inches),
+                    left=Measurement(0.1, MeasurementUnitType.Inches),
+                    right=Measurement(0.1, MeasurementUnitType.Inches),
+                ),
+            ).size(A4).scale(1.0)
+
+            try:
+                response = route.run()
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting document to PDF: {err}",
+                ) from err
+
+        html_pdf = tempdir / "html.pdf"
+        html_pdf.write_bytes(response.content)
+        return html_pdf
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -193,11 +193,17 @@ class ParserRegistry:
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
+        from paperless.parsers.mail import MailDocumentParser
+        from paperless.parsers.remote import RemoteDocumentParser
+        from paperless.parsers.tesseract import RasterisedDocumentParser
        from paperless.parsers.text import TextDocumentParser
        from paperless.parsers.tika import TikaDocumentParser

        self.register_builtin(TextDocumentParser)
+        self.register_builtin(RemoteDocumentParser)
        self.register_builtin(TikaDocumentParser)
+        self.register_builtin(MailDocumentParser)
+        self.register_builtin(RasterisedDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
@@ -298,6 +304,23 @@ class ParserRegistry:
                getattr(cls, "url", "unknown"),
            )

+    # ------------------------------------------------------------------
+    # Inspection helpers
+    # ------------------------------------------------------------------
+
+    def all_parsers(self) -> list[type[ParserProtocol]]:
+        """Return all registered parser classes (external first, then builtins).
+
+        Used by compatibility wrappers that need to iterate every parser to
+        compute the full set of supported MIME types and file extensions.
+
+        Returns
+        -------
+        list[type[ParserProtocol]]
+            External parsers followed by built-in parsers.
+        """
+        return [*self._external, *self._builtins]
+
    # ------------------------------------------------------------------
    # Parser resolution
    # ------------------------------------------------------------------
@@ -328,7 +351,7 @@ class ParserRegistry:
        mime_type:
            The detected MIME type of the file.
        filename:
-            The original filename, including extension.
+            The original filename, including extension.  May be empty in some cases.
        path:
            Optional filesystem path to the file. Forwarded to each
            parser's score method.
--- a/src/paperless/parsers/remote.py
+++ b/src/paperless/parsers/remote.py
@@ -0,0 +1,433 @@
+"""
+Built-in remote-OCR document parser.
+
+Handles documents by sending them to a configured remote OCR engine
+(currently Azure AI Vision / Document Intelligence) and retrieving both
+the extracted text and a searchable PDF with an embedded text layer.
+
+When no engine is configured, ``score()`` returns ``None`` so the parser
+is effectively invisible to the registry — the tesseract parser handles
+these MIME types instead.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
+
+from django.conf import settings
+
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext
+
+logger = logging.getLogger("paperless.parsing.remote")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "application/pdf": ".pdf",
+    "image/png": ".png",
+    "image/jpeg": ".jpg",
+    "image/tiff": ".tiff",
+    "image/bmp": ".bmp",
+    "image/gif": ".gif",
+    "image/webp": ".webp",
+}
+
+
+class RemoteEngineConfig:
+    """Holds and validates the remote OCR engine configuration."""
+
+    def __init__(
+        self,
+        engine: str | None,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+    ) -> None:
+        self.engine = engine
+        self.api_key = api_key
+        self.endpoint = endpoint
+
+    def engine_is_valid(self) -> bool:
+        """Return True when the engine is known and fully configured."""
+        return (
+            self.engine in ("azureai",)
+            and self.api_key is not None
+            and not (self.engine == "azureai" and self.endpoint is None)
+        )
+
+
+class RemoteDocumentParser:
+    """Parse documents via a remote OCR API (currently Azure AI Vision).
+
+    This parser sends documents to a remote engine that returns both
+    extracted text and a searchable PDF with an embedded text layer.
+    It does not depend on Tesseract or ocrmypdf.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
+    """
+
+    name: str = "Paperless-ngx Remote OCR Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser can handle.
+
+        The full set is always returned regardless of whether a remote
+        engine is configured.  The ``score()`` method handles the
+        "am I active?" logic by returning ``None`` when not configured.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file, or None.
+
+        Returns ``None`` when no valid remote engine is configured,
+        making the parser invisible to the registry for this file.
+        When configured, returns 20 — higher than the Tesseract parser's
+        default of 10 — so the remote engine takes priority.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            20 when the remote engine is configured and the MIME type is
+            supported, otherwise None.
+        """
+        config = RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+        if not config.engine_is_valid():
+            return None
+        if mime_type not in _SUPPORTED_MIME_TYPES:
+            return None
+        return 20
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always True — the remote engine always returns a PDF with an
+            embedded text layer that serves as the archive copy.
+        """
+        return True
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always False — all supported originals are displayable by
+            the browser (PDF) or handled via the archive copy (images).
+        """
+        return False
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
+        self._logging_group = logging_group
+        self._text: str | None = None
+        self._archive_path: Path | None = None
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        logger.debug("Cleaning up temporary directory %s", self._tempdir)
+        shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def configure(self, context: ParserContext) -> None:
+        pass
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Send the document to the remote engine and store results.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the document file to parse.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            Ignored — the remote engine always returns a searchable PDF,
+            which is stored as the archive copy regardless of this flag.
+        """
+        config = RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+
+        if not config.engine_is_valid():
+            logger.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self._text = ""
+            return
+
+        if config.engine == "azureai":
+            self._text = self._azure_ai_vision_parse(document_path, config)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse."""
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Always None — the remote parser does not detect dates.
+        """
+        return None
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated archive PDF, or None."""
+        return self._archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Generate a thumbnail image for the document.
+
+        Uses the archive PDF produced by the remote engine when available,
+        otherwise falls back to the original document path (PDF inputs).
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        Path
+            Path to the generated WebP thumbnail inside the temp directory.
+        """
+        # make_thumbnail_from_pdf still lives in documents.parsers (the
+        # tesseract parser imports it from there too); the planned home is
+        # paperless.parsers.utils — TODO update this import once it moves.
+        from documents.parsers import make_thumbnail_from_pdf
+
+        return make_thumbnail_from_pdf(
+            self._archive_path or document_path,
+            self._tempdir,
+            self._logging_group,
+        )
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in a PDF document.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        int | None
+            Page count for PDF inputs, or ``None`` for other MIME types.
+        """
+        if mime_type != "application/pdf":
+            return None
+
+        from paperless.parsers.utils import get_page_count_for_pdf
+
+        return get_page_count_for_pdf(document_path, log=logger)
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata from the document.
+
+        Delegates to the shared pikepdf-based extractor for PDF files.
+        Returns ``[]`` for all other MIME types.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the file to extract metadata from.
+        mime_type:
+            MIME type of the file.  May be ``"application/pdf"`` when
+            called for the archive version of an image original.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Zero or more metadata entries.
+        """
+        if mime_type != "application/pdf":
+            return []
+
+        from paperless.parsers.utils import extract_pdf_metadata
+
+        return extract_pdf_metadata(document_path, log=logger)
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _azure_ai_vision_parse(
+        self,
+        file: Path,
+        config: RemoteEngineConfig,
+    ) -> str | None:
+        """Send ``file`` to Azure AI Document Intelligence and return text.
+
+        Downloads the searchable PDF output from Azure and stores it at
+        ``self._archive_path``.  Returns the extracted text content; on
+        failure logs the error, clears the archive path and returns ``None``.
+
+        Parameters
+        ----------
+        file:
+            Absolute path to the document to analyse.
+        config:
+            Validated remote engine configuration.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if the Azure call failed.
+        """
+        if TYPE_CHECKING:
+            # Callers must have already validated config via engine_is_valid():
+            # engine_is_valid() asserts api_key is not None and (for azureai)
+            # endpoint is not None, so these casts are provably safe.
+            assert config.endpoint is not None
+            assert config.api_key is not None
+
+        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+        from azure.ai.documentintelligence.models import AnalyzeOutputOption
+        from azure.ai.documentintelligence.models import DocumentContentFormat
+        from azure.core.credentials import AzureKeyCredential
+
+        client = DocumentIntelligenceClient(
+            endpoint=config.endpoint,
+            credential=AzureKeyCredential(config.api_key),
+        )
+
+        try:
+            with file.open("rb") as f:
+                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
+                poller = client.begin_analyze_document(
+                    model_id="prebuilt-read",
+                    body=analyze_request,
+                    output_content_format=DocumentContentFormat.TEXT,
+                    output=[AnalyzeOutputOption.PDF],
+                    content_type="application/json",
+                )
+
+            poller.wait()
+            result_id = poller.details["operation_id"]
+            result = poller.result()
+
+            self._archive_path = self._tempdir / "archive.pdf"
+            with self._archive_path.open("wb") as f:
+                for chunk in client.get_analyze_result_pdf(
+                    model_id="prebuilt-read",
+                    result_id=result_id,
+                ):
+                    f.write(chunk)
+
+            return result.content
+        except Exception as e:
+            logger.error("Azure AI Vision parsing failed: %s", e)
+            # Never expose a missing or partially downloaded archive PDF
+            self._archive_path = None
+        finally:
+            client.close()
+
+        return None
--- a/src/paperless/parsers/tesseract.py
+++ b/src/paperless/parsers/tesseract.py
@@ -1,13 +1,18 @@
+from __future__ import annotations
+
+import logging
 import os
 import re
+import shutil
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
+from typing import Any
+from typing import Self

 from django.conf import settings
 from PIL import Image

-from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
@@ -16,6 +21,28 @@ from paperless.config import OcrConfig
 from paperless.models import ArchiveFileChoices
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
+from paperless.parsers.utils import read_file_handle_unicode_errors
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext
+
+logger = logging.getLogger("paperless.parsing.tesseract")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "application/pdf": ".pdf",
+    "image/jpeg": ".jpg",
+    "image/png": ".png",
+    "image/tiff": ".tif",
+    "image/gif": ".gif",
+    "image/bmp": ".bmp",
+    "image/webp": ".webp",
+    "image/heic": ".heic",
+}


 class NoTextFoundException(Exception):
@@ -26,81 +53,125 @@ class RtlLanguageException(Exception):
    pass


-class RasterisedDocumentParser(DocumentParser):
+class RasterisedDocumentParser:
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

-    logging_name = "paperless.parsing.tesseract"
+    name: str = "Paperless-ngx Tesseract OCR Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"

-    def get_settings(self) -> OcrConfig:
-        """
-        This parser uses the OCR configuration settings to parse documents
-        """
-        return OcrConfig()
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------

-    def get_page_count(self, document_path, mime_type):
-        page_count = None
-        if mime_type == "application/pdf":
-            try:
-                import pikepdf
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        return _SUPPORTED_MIME_TYPES

-                with pikepdf.Pdf.open(document_path) as pdf:
-                    page_count = len(pdf.pages)
-            except Exception as e:
-                self.log.warning(
-                    f"Unable to determine PDF page count {document_path}: {e}",
-                )
-        return page_count
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        if mime_type in _SUPPORTED_MIME_TYPES:
+            return 10
+        return None

-    def extract_metadata(self, document_path, mime_type):
-        result = []
-        if mime_type == "application/pdf":
-            import pikepdf
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------

-            namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+    @property
+    def can_produce_archive(self) -> bool:
+        return True

-            pdf = pikepdf.open(document_path)
-            meta = pdf.open_metadata()
-            for key, value in meta.items():
-                if isinstance(value, list):
-                    value = " ".join([str(e) for e in value])
-                value = str(value)
-                try:
-                    m = namespace_pattern.match(key)
-                    if m is None:  # pragma: no cover
-                        continue
-                    namespace = m.group(1)
-                    key_value = m.group(2)
-                    try:
-                        namespace.encode("utf-8")
-                        key_value.encode("utf-8")
-                    except UnicodeEncodeError as e:  # pragma: no cover
-                        self.log.debug(f"Skipping metadata key {key}: {e}")
-                        continue
-                    result.append(
-                        {
-                            "namespace": namespace,
-                            "prefix": meta.REVERSE_NS[namespace],
-                            "key": key_value,
-                            "value": value,
-                        },
-                    )
-                except Exception as e:
-                    self.log.warning(
-                        f"Error while reading metadata {key}: {value}. Error: {e}",
-                    )
-        return result
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        return False

-    def get_thumbnail(self, document_path, mime_type, file_name=None):
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self.tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
+        self.settings = OcrConfig()
+        self.archive_path: Path | None = None
+        self.text: str | None = None
+        self.date: datetime.datetime | None = None
+        self.log = logger
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        logger.debug("Cleaning up temporary directory %s", self.tempdir)
+        shutil.rmtree(self.tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def configure(self, context: ParserContext) -> None:
+        pass
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        return self.text
+
+    def get_date(self) -> datetime.datetime | None:
+        return self.date
+
+    def get_archive_path(self) -> Path | None:
+        return self.archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail, page count, and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        return make_thumbnail_from_pdf(
-            self.archive_path or document_path,
+            self.archive_path or Path(document_path),
            self.tempdir,
-            self.logging_group,
        )

-    def is_image(self, mime_type) -> bool:
+    def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
+        if mime_type == "application/pdf":
+            from paperless.parsers.utils import get_page_count_for_pdf
+
+            return get_page_count_for_pdf(Path(document_path), log=self.log)
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        if mime_type != "application/pdf":
+            return []
+
+        from paperless.parsers.utils import extract_pdf_metadata
+
+        return extract_pdf_metadata(Path(document_path), log=self.log)
+
+    def is_image(self, mime_type: str) -> bool:
        return mime_type in [
            "image/png",
            "image/jpeg",
@@ -111,25 +182,25 @@ class RasterisedDocumentParser(DocumentParser):
            "image/heic",
        ]

-    def has_alpha(self, image) -> bool:
+    def has_alpha(self, image: Path) -> bool:
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")

-    def remove_alpha(self, image_path: str) -> Path:
+    def remove_alpha(self, image_path: Path) -> Path:
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
-                image_path,
-                no_alpha_image,
+                str(image_path),
+                str(no_alpha_image),
            ],
            logger=self.log,
        )
        return no_alpha_image

-    def get_dpi(self, image) -> int | None:
+    def get_dpi(self, image: Path) -> int | None:
        try:
            with Image.open(image) as im:
                x, _ = im.info["dpi"]
@@ -138,7 +209,7 @@ class RasterisedDocumentParser(DocumentParser):
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
            return None

-    def calculate_a4_dpi(self, image) -> int | None:
+    def calculate_a4_dpi(self, image: Path) -> int | None:
        try:
            with Image.open(image) as im:
                width, _ = im.size
@@ -156,6 +227,7 @@ class RasterisedDocumentParser(DocumentParser):
        sidecar_file: Path | None,
        pdf_file: Path,
    ) -> str | None:
+        text: str | None = None
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
@@ -163,7 +235,7 @@ class RasterisedDocumentParser(DocumentParser):
            and sidecar_file.is_file()
            and self.settings.mode != "redo"
        ):
-            text = self.read_file_handle_unicode_errors(sidecar_file)
+            text = read_file_handle_unicode_errors(sidecar_file)

            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
@@ -191,12 +263,12 @@ class RasterisedDocumentParser(DocumentParser):
                        "-layout",
                        "-enc",
                        "UTF-8",
-                        pdf_file,
+                        str(pdf_file),
                        tmp.name,
                    ],
                    logger=self.log,
                )
-                text = self.read_file_handle_unicode_errors(Path(tmp.name))
+                text = read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)

@@ -211,16 +283,14 @@ class RasterisedDocumentParser(DocumentParser):

    def construct_ocrmypdf_parameters(
        self,
-        input_file,
-        mime_type,
-        output_file,
-        sidecar_file,
+        input_file: Path,
+        mime_type: str,
+        output_file: Path,
+        sidecar_file: Path,
        *,
-        safe_fallback=False,
-    ):
-        if TYPE_CHECKING:
-            assert isinstance(self.settings, OcrConfig)
-        ocrmypdf_args = {
+        safe_fallback: bool = False,
+    ) -> dict[str, Any]:
+        ocrmypdf_args: dict[str, Any] = {
            "input_file_or_options": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
@@ -330,7 +400,13 @@ class RasterisedDocumentParser(DocumentParser):

        return ocrmypdf_args

-    def parse(self, document_path: Path, mime_type, file_name=None) -> None:
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        VALID_TEXT_LENGTH = 50
@@ -458,7 +534,7 @@ class RasterisedDocumentParser(DocumentParser):
                self.text = ""


-def post_process_text(text):
+def post_process_text(text: str | None) -> str | None:
    if not text:
        return None

--- a/src/paperless/parsers/text.py
+++ b/src/paperless/parsers/text.py
@@ -27,6 +27,7 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.text")

@@ -156,6 +157,9 @@ class TextDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        pass
+
    def parse(
        self,
        document_path: Path,
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.tika")

@@ -205,6 +206,9 @@ class TikaDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        pass
+
    def parse(
        self,
        document_path: Path,
@@ -340,11 +344,19 @@ class TikaDocumentParser:
    ) -> int | None:
        """Return the number of pages in the document.

+        Counts pages in the archive PDF produced by a preceding parse()
+        call.  Returns ``None`` if parse() has not been called yet or if
+        no archive was produced.
+
        Returns
        -------
        int | None
-            Always None — page count is not available from Tika.
+            Page count of the archive PDF, or ``None``.
        """
+        if self._archive_path is not None:
+            from paperless.parsers.utils import get_page_count_for_pdf
+
+            return get_page_count_for_pdf(self._archive_path, log=logger)
        return None

    def extract_metadata(
--- a/src/paperless/parsers/utils.py
+++ b/src/paperless/parsers/utils.py
@@ -0,0 +1,158 @@
+"""
+Shared utilities for Paperless-ngx document parsers.
+
+Functions here are format-neutral helpers that multiple parsers need.
+Keeping them here avoids parsers inheriting from each other just to
+share implementation.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsers.utils")
+
+
+def read_file_handle_unicode_errors(
+    filepath: Path,
+    log: logging.Logger | None = None,
+) -> str:
+    """Read a file as UTF-8 text, replacing invalid bytes rather than raising.
+
+    Parameters
+    ----------
+    filepath:
+        Absolute path to the file to read.
+    log:
+        Logger to use for warnings.  Falls back to the module-level logger
+        when omitted.
+
+    Returns
+    -------
+    str
+        File content with invalid UTF-8 sequences replaced by U+FFFD; line
+        endings are normalised the same way on the strict and lenient paths.
+    """
+    _log = log or logger
+    try:
+        return filepath.read_text(encoding="utf-8")
+    except UnicodeDecodeError as e:
+        _log.warning("Unicode error during text reading, continuing: %s", e)
+        return filepath.read_text(encoding="utf-8", errors="replace")
+
+
+def get_page_count_for_pdf(
+    document_path: Path,
+    log: logging.Logger | None = None,
+) -> int | None:
+    """Return the number of pages in a PDF file using pikepdf.
+
+    Parameters
+    ----------
+    document_path:
+        Absolute path to the PDF file.
+    log:
+        Logger to use for warnings.  Falls back to the module-level logger
+        when omitted.
+
+    Returns
+    -------
+    int | None
+        Page count, or ``None`` if the file cannot be opened or is not a
+        valid PDF.
+    """
+    import pikepdf
+
+    _log = log or logger
+
+    try:
+        with pikepdf.Pdf.open(document_path) as pdf:
+            return len(pdf.pages)
+    except Exception as e:
+        _log.warning("Unable to determine PDF page count for %s: %s", document_path, e)
+        return None
+
+
+def extract_pdf_metadata(
+    document_path: Path,
+    log: logging.Logger | None = None,
+) -> list[MetadataEntry]:
+    """Extract XMP/PDF metadata from a PDF file using pikepdf.
+
+    Reads all XMP metadata entries from the document and returns them as a
+    list of ``MetadataEntry`` dicts.  The function never raises — any failure
+    to open the file or read a specific key is logged and skipped.
+
+    Parameters
+    ----------
+    document_path:
+        Absolute path to the PDF file.
+    log:
+        Logger to use for warnings and debug messages.  Falls back to the
+        module-level logger when omitted.
+
+    Returns
+    -------
+    list[MetadataEntry]
+        Zero or more metadata entries.  Returns ``[]`` if the file cannot
+        be opened or contains no readable XMP metadata.
+    """
+    import pikepdf
+
+    from paperless.parsers import MetadataEntry
+
+    _log = log or logger
+    result: list[MetadataEntry] = []
+    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+    try:
+        # The context manager closes the PDF on every exit path; entries
+        # are read while the file is still open.
+        with pikepdf.open(document_path) as pdf:
+            meta = pdf.open_metadata()
+            for key, value in meta.items():
+                if isinstance(value, list):
+                    value = " ".join(str(e) for e in value)
+                value = str(value)
+
+                try:
+                    m = namespace_pattern.match(key)
+                    if m is None:
+                        continue
+
+                    namespace = m.group(1)
+                    key_value = m.group(2)
+
+                    try:
+                        namespace.encode("utf-8")
+                        key_value.encode("utf-8")
+                    except UnicodeEncodeError as enc_err:
+                        _log.debug("Skipping metadata key %s: %s", key, enc_err)
+                        continue
+
+                    result.append(
+                        MetadataEntry(
+                            namespace=namespace,
+                            prefix=meta.REVERSE_NS[namespace],
+                            key=key_value,
+                            value=value,
+                        ),
+                    )
+                except Exception as e:
+                    _log.warning(
+                        "Error reading metadata key %s value %s: %s",
+                        key,
+                        value,
+                        e,
+                    )
+    except Exception as e:
+        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
+
+    return result
--- a/src/paperless/settings/init.py
+++ b/src/paperless/settings/init.py
@@ -121,10 +121,7 @@ INSTALLED_APPS = [
    "django_extensions",
    "paperless",
    "documents.apps.DocumentsConfig",
-    "paperless_tesseract.apps.PaperlessTesseractConfig",
-    "paperless_text.apps.PaperlessTextConfig",
    "paperless_mail.apps.PaperlessMailConfig",
-    "paperless_remote.apps.PaperlessRemoteParserConfig",
    "django.contrib.admin",
    "rest_framework",
    "rest_framework.authtoken",
@@ -974,8 +971,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
    "http://localhost:3000",
 )

-if TIKA_ENABLED:
-    INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
+# Tika parser is now integrated into the main parser registry
+# No separate Django app needed

 AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
 if AUDIT_LOG_ENABLED:
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -6,16 +6,29 @@ so it is easy to see which files belong to which test module.

 from __future__ import annotations

+from contextlib import contextmanager
 from typing import TYPE_CHECKING

 import pytest
+from django.test import override_settings

+from paperless.parsers.mail import MailDocumentParser
+from paperless.parsers.remote import RemoteDocumentParser
+from paperless.parsers.tesseract import RasterisedDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser

 if TYPE_CHECKING:
+    from collections.abc import Callable
    from collections.abc import Generator
    from pathlib import Path
+    from unittest.mock import MagicMock
+
+    from pytest_django.fixtures import SettingsWrapper
+    from pytest_mock import MockerFixture
+
+    #: Type for the ``make_tesseract_parser`` fixture factory.
+    MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]


 # ------------------------------------------------------------------
@@ -77,6 +90,92 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
        yield parser


+# ------------------------------------------------------------------
+# Remote parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def remote_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the remote parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/remote/``
+    """
+    return samples_dir / "remote"
+
+
+@pytest.fixture(scope="session")
+def sample_pdf_file(remote_samples_dir: Path) -> Path:
+    """Path to a simple digital PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``remote/simple-digital.pdf``.
+    """
+    return remote_samples_dir / "simple-digital.pdf"
+
+
+# ------------------------------------------------------------------
+# Remote parser instance
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
+    """Yield a RemoteDocumentParser and clean up its temporary directory afterwards.
+
+    Yields
+    ------
+    RemoteDocumentParser
+        A ready-to-use parser instance.
+    """
+    with RemoteDocumentParser() as parser:
+        yield parser
+
+
+# ------------------------------------------------------------------
+# Remote parser settings helpers
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
+    """Configure Django settings for a valid Azure AI OCR engine.
+
+    Sets ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``, and
+    ``REMOTE_OCR_ENDPOINT`` to test values.  Settings are restored
+    automatically after the test by pytest-django.
+
+    Returns
+    -------
+    SettingsWrapper
+        The modified settings object (for chaining further overrides).
+    """
+    settings.REMOTE_OCR_ENGINE = "azureai"
+    settings.REMOTE_OCR_API_KEY = "test-api-key"
+    settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
+    return settings
+
+
+@pytest.fixture()
+def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
+    """Configure Django settings with no remote engine configured.
+
+    Returns
+    -------
+    SettingsWrapper
+        The modified settings object.
+    """
+    settings.REMOTE_OCR_ENGINE = None
+    settings.REMOTE_OCR_API_KEY = None
+    settings.REMOTE_OCR_ENDPOINT = None
+    return settings
+
+
 # ------------------------------------------------------------------
 # Tika parser sample files
 # ------------------------------------------------------------------
@@ -158,3 +257,544 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    with TikaDocumentParser() as parser:
        yield parser
+
+
+# ------------------------------------------------------------------
+# Mail parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def mail_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the mail parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/mail/``
+    """
+    return samples_dir / "mail"
+
+
+@pytest.fixture(scope="session")
+def broken_email_file(mail_samples_dir: Path) -> Path:
+    """Path to a broken/malformed EML sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/broken.eml``.
+    """
+    return mail_samples_dir / "broken.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_file(mail_samples_dir: Path) -> Path:
+    """Path to a plain-text email sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/simple_text.eml``.
+    """
+    return mail_samples_dir / "simple_text.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
+    """Path to the expected PDF rendition of the plain-text email.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/simple_text.eml.pdf``.
+    """
+    return mail_samples_dir / "simple_text.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
+    """Path to the expected thumbnail for the plain-text email.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/simple_text.eml.pdf.webp``.
+    """
+    return mail_samples_dir / "simple_text.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_file(mail_samples_dir: Path) -> Path:
+    """Path to an HTML email sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/html.eml``.
+    """
+    return mail_samples_dir / "html.eml"
+
+
+@pytest.fixture(scope="session")
+def html_email_pdf_file(mail_samples_dir: Path) -> Path:
+    """Path to the expected PDF rendition of the HTML email.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/html.eml.pdf``.
+    """
+    return mail_samples_dir / "html.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
+    """Path to the expected thumbnail for the HTML email.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/html.eml.pdf.webp``.
+    """
+    return mail_samples_dir / "html.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_html_file(mail_samples_dir: Path) -> Path:
+    """Path to the HTML body of the HTML email sample.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/html.eml.html``.
+    """
+    return mail_samples_dir / "html.eml.html"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_first(mail_samples_dir: Path) -> Path:
+    """Path to the first PDF used in PDF-merge tests.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/first.pdf``.
+    """
+    return mail_samples_dir / "first.pdf"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_second(mail_samples_dir: Path) -> Path:
+    """Path to the second PDF used in PDF-merge tests.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``mail/second.pdf``.
+    """
+    return mail_samples_dir / "second.pdf"
+
+
+# ------------------------------------------------------------------
+# Mail parser instance
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def mail_parser() -> Generator[MailDocumentParser, None, None]:
+    """Yield a MailDocumentParser and clean up its temporary directory afterwards.
+
+    Yields
+    ------
+    MailDocumentParser
+        A ready-to-use parser instance.
+    """
+    with MailDocumentParser() as parser:
+        yield parser
+
+
+@pytest.fixture(scope="session")
+def nginx_base_url() -> Generator[str, None, None]:
+    """
+    The base URL for the nginx HTTP server we expect to be alive
+    """
+    yield "http://localhost:8080"
+
+
+# ------------------------------------------------------------------
+# Tesseract parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def tesseract_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the tesseract parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/tesseract/``
+    """
+    return samples_dir / "tesseract"
+
+
+@pytest.fixture(scope="session")
+def document_webp_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a WebP document sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/document.webp``.
+    """
+    return tesseract_samples_dir / "document.webp"
+
+
+@pytest.fixture(scope="session")
+def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to an encrypted PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/encrypted.pdf``.
+    """
+    return tesseract_samples_dir / "encrypted.pdf"
+
+
+@pytest.fixture(scope="session")
+def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a multi-page digital PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/multi-page-digital.pdf``.
+    """
+    return tesseract_samples_dir / "multi-page-digital.pdf"
+
+
+@pytest.fixture(scope="session")
+def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a multi-page TIFF with alpha channel in RGB.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/multi-page-images-alpha-rgb.tiff``.
+    """
+    return tesseract_samples_dir / "multi-page-images-alpha-rgb.tiff"
+
+
+@pytest.fixture(scope="session")
+def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a multi-page TIFF with alpha channel.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/multi-page-images-alpha.tiff``.
+    """
+    return tesseract_samples_dir / "multi-page-images-alpha.tiff"
+
+
+@pytest.fixture(scope="session")
+def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a multi-page PDF with images.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/multi-page-images.pdf``.
+    """
+    return tesseract_samples_dir / "multi-page-images.pdf"
+
+
+@pytest.fixture(scope="session")
+def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a multi-page TIFF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/multi-page-images.tiff``.
+    """
+    return tesseract_samples_dir / "multi-page-images.tiff"
+
+
+@pytest.fixture(scope="session")
+def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a multi-page mixed PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/multi-page-mixed.pdf``.
+    """
+    return tesseract_samples_dir / "multi-page-mixed.pdf"
+
+
+@pytest.fixture(scope="session")
+def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a PNG with alpha channel and no text.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/no-text-alpha.png``.
+    """
+    return tesseract_samples_dir / "no-text-alpha.png"
+
+
+@pytest.fixture(scope="session")
+def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a rotated PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/rotated.pdf``.
+    """
+    return tesseract_samples_dir / "rotated.pdf"
+
+
+@pytest.fixture(scope="session")
+def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to an RTL test PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/rtl-test.pdf``.
+    """
+    return tesseract_samples_dir / "rtl-test.pdf"
+
+
+@pytest.fixture(scope="session")
+def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a signed PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/signed.pdf``.
+    """
+    return tesseract_samples_dir / "signed.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple PNG with alpha channel.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple-alpha.png``.
+    """
+    return tesseract_samples_dir / "simple-alpha.png"
+
+
+@pytest.fixture(scope="session")
+def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple digital PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple-digital.pdf``.
+    """
+    return tesseract_samples_dir / "simple-digital.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple PNG without DPI information.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple-no-dpi.png``.
+    """
+    return tesseract_samples_dir / "simple-no-dpi.png"
+
+
+@pytest.fixture(scope="session")
+def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple BMP sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple.bmp``.
+    """
+    return tesseract_samples_dir / "simple.bmp"
+
+
+@pytest.fixture(scope="session")
+def simple_gif_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple GIF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple.gif``.
+    """
+    return tesseract_samples_dir / "simple.gif"
+
+
+@pytest.fixture(scope="session")
+def simple_heic_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple HEIC sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple.heic``.
+    """
+    return tesseract_samples_dir / "simple.heic"
+
+
+@pytest.fixture(scope="session")
+def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple JPG sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple.jpg``.
+    """
+    return tesseract_samples_dir / "simple.jpg"
+
+
+@pytest.fixture(scope="session")
+def simple_png_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple PNG sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple.png``.
+    """
+    return tesseract_samples_dir / "simple.png"
+
+
+@pytest.fixture(scope="session")
+def simple_tif_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a simple TIF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/simple.tif``.
+    """
+    return tesseract_samples_dir / "simple.tif"
+
+
+@pytest.fixture(scope="session")
+def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a single-page mixed PDF sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/single-page-mixed.pdf``.
+    """
+    return tesseract_samples_dir / "single-page-mixed.pdf"
+
+
+@pytest.fixture(scope="session")
+def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
+    """Path to a PDF with form sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tesseract/with-form.pdf``.
+    """
+    return tesseract_samples_dir / "with-form.pdf"
+
+
+# ------------------------------------------------------------------
+# Tesseract parser instance and settings helpers
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def null_app_config(mocker: MockerFixture) -> MagicMock:
+    """Return a MagicMock with all OcrConfig fields set to None.
+
+    This allows the parser to fall back to Django settings instead of
+    hitting the database.
+
+    Returns
+    -------
+    MagicMock
+        Mock config with all fields as None
+    """
+    return mocker.MagicMock(
+        output_type=None,
+        pages=None,
+        language=None,
+        mode=None,
+        skip_archive_file=None,
+        image_dpi=None,
+        unpaper_clean=None,
+        deskew=None,
+        rotate_pages=None,
+        rotate_pages_threshold=None,
+        max_image_pixels=None,
+        color_conversion_strategy=None,
+        user_args=None,
+    )
+
+
+@pytest.fixture()
+def tesseract_parser(
+    mocker: MockerFixture,
+    null_app_config: MagicMock,
+) -> Generator[RasterisedDocumentParser, None, None]:
+    """Yield a RasterisedDocumentParser and clean up its temporary directory afterwards.
+
+    Patches the config system to avoid database access.
+
+    Yields
+    ------
+    RasterisedDocumentParser
+        A ready-to-use parser instance.
+    """
+    mocker.patch(
+        "paperless.config.BaseConfig._get_config_instance",
+        return_value=null_app_config,
+    )
+    with RasterisedDocumentParser() as parser:
+        yield parser
+
+
+@pytest.fixture()
+def make_tesseract_parser(
+    mocker: MockerFixture,
+    null_app_config: MagicMock,
+) -> MakeTesseractParser:
+    """Return a factory for creating RasterisedDocumentParser with Django settings overrides.
+
+    This fixture is useful for tests that need to create parsers with different
+    settings configurations.
+
+    Returns
+    -------
+    Callable[..., AbstractContextManager[RasterisedDocumentParser]]
+        A context manager factory that accepts Django settings overrides
+    """
+    mocker.patch(
+        "paperless.config.BaseConfig._get_config_instance",
+        return_value=null_app_config,
+    )
+
+    @contextmanager
+    def _make_parser(**django_settings_overrides):
+        with override_settings(**django_settings_overrides):
+            with RasterisedDocumentParser() as parser:
+                yield parser
+
+    return _make_parser
--- a/src/paperless/tests/parsers/test_mail_parser.py
+++ b/src/paperless/tests/parsers/test_mail_parser.py
@@ -12,7 +12,64 @@ from pytest_httpx import HTTPXMock
 from pytest_mock import MockerFixture

 from documents.parsers import ParseError
-from paperless_mail.parsers import MailDocumentParser
+from paperless.parsers import ParserContext
+from paperless.parsers import ParserProtocol
+from paperless.parsers.mail import MailDocumentParser
+
+
+class TestMailParserProtocol:
+    """Verify that MailDocumentParser satisfies the ParserProtocol contract."""
+
+    def test_isinstance_satisfies_protocol(
+        self,
+        mail_parser: MailDocumentParser,
+    ) -> None:
+        assert isinstance(mail_parser, ParserProtocol)
+
+    def test_supported_mime_types(self) -> None:
+        mime_types = MailDocumentParser.supported_mime_types()
+        assert isinstance(mime_types, dict)
+        assert "message/rfc822" in mime_types
+
+    @pytest.mark.parametrize(
+        ("mime_type", "expected"),
+        [
+            ("message/rfc822", 10),
+            ("application/pdf", None),
+            ("text/plain", None),
+        ],
+    )
+    def test_score(self, mime_type: str, expected: int | None) -> None:
+        assert MailDocumentParser.score(mime_type, "email.eml") == expected
+
+    def test_can_produce_archive_is_false(
+        self,
+        mail_parser: MailDocumentParser,
+    ) -> None:
+        assert mail_parser.can_produce_archive is False
+
+    def test_requires_pdf_rendition_is_true(
+        self,
+        mail_parser: MailDocumentParser,
+    ) -> None:
+        assert mail_parser.requires_pdf_rendition is True
+
+    def test_get_page_count_returns_none_without_archive(
+        self,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+    ) -> None:
+        assert mail_parser.get_page_count(html_email_file, "message/rfc822") is None
+
+    def test_get_page_count_returns_int_with_pdf_archive(
+        self,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_pdf_file: Path,
+    ) -> None:
+        mail_parser._archive_path = simple_txt_email_pdf_file
+        count = mail_parser.get_page_count(simple_txt_email_pdf_file, "message/rfc822")
+        assert isinstance(count, int)
+        assert count > 0


 class TestEmailFileParsing:
@@ -24,7 +81,7 @@ class TestEmailFileParsing:
    def test_parse_error_missing_file(
        self,
        mail_parser: MailDocumentParser,
-        sample_dir: Path,
+        mail_samples_dir: Path,
    ) -> None:
        """
        GIVEN:
@@ -35,7 +92,7 @@ class TestEmailFileParsing:
            - An Exception is thrown
        """
        # Check if exception is raised when parsing fails.
-        test_file = sample_dir / "doesntexist.eml"
+        test_file = mail_samples_dir / "doesntexist.eml"

        assert not test_file.exists()

@@ -246,12 +303,12 @@ class TestEmailThumbnailGenerate:
        """
        mocked_return = "Passing the return value through.."
        mock_make_thumbnail_from_pdf = mocker.patch(
-            "paperless_mail.parsers.make_thumbnail_from_pdf",
+            "paperless.parsers.mail.make_thumbnail_from_pdf",
        )
        mock_make_thumbnail_from_pdf.return_value = mocked_return

        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = "Mocked return value.."

@@ -260,8 +317,7 @@ class TestEmailThumbnailGenerate:
        mock_generate_pdf.assert_called_once()
        mock_make_thumbnail_from_pdf.assert_called_once_with(
            "Mocked return value..",
-            mail_parser.tempdir,
-            None,
+            mail_parser._tempdir,
        )

        assert mocked_return == thumb
@@ -373,7 +429,7 @@ class TestParser:
        """
        # Validate parsing returns the expected results
        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )

        mail_parser.parse(simple_txt_email_file, "message/rfc822")
@@ -385,7 +441,7 @@ class TestParser:
            "BCC: fdf@fvf.de\n\n"
            "\n\nThis is just a simple Text Mail."
        )
-        assert text_expected == mail_parser.text
+        assert text_expected == mail_parser.get_text()
        assert (
            datetime.datetime(
                2022,
@@ -396,7 +452,7 @@ class TestParser:
                43,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.date
+            == mail_parser.get_date()
        )

        # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
@@ -419,7 +475,7 @@ class TestParser:
        """

        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )

        # Validate parsing returns the expected results
@@ -443,7 +499,7 @@ class TestParser:
        mail_parser.parse(html_email_file, "message/rfc822")

        mock_generate_pdf.assert_called_once()
-        assert text_expected == mail_parser.text
+        assert text_expected == mail_parser.get_text()
        assert (
            datetime.datetime(
                2022,
@@ -454,7 +510,7 @@ class TestParser:
                19,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )
-            == mail_parser.date
+            == mail_parser.get_date()
        )

    def test_generate_pdf_parse_error(
@@ -501,7 +557,7 @@ class TestParser:

        mail_parser.parse(simple_txt_email_file, "message/rfc822")

-        assert mail_parser.archive_path is not None
+        assert mail_parser.get_archive_path() is not None

    @pytest.mark.httpx_mock(can_send_already_matched_responses=True)
    def test_generate_pdf_html_email(
@@ -542,7 +598,7 @@ class TestParser:
        )
        mail_parser.parse(html_email_file, "message/rfc822")

-        assert mail_parser.archive_path is not None
+        assert mail_parser.get_archive_path() is not None

    def test_generate_pdf_html_email_html_to_pdf_failure(
        self,
@@ -712,10 +768,10 @@ class TestParser:

        def test_layout_option(layout_option, expected_calls, expected_pdf_names):
            mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
+            mail_parser.configure(ParserContext(mailrule_id=1))
            mail_parser.parse(
                document_path=html_email_file,
                mime_type="message/rfc822",
-                mailrule_id=1,
            )
            args, _ = mock_merge_route.call_args
            assert len(args[0]) == expected_calls
--- a/src/paperless/tests/parsers/test_mail_parser_live.py
+++ b/src/paperless/tests/parsers/test_mail_parser_live.py
@@ -11,7 +11,7 @@ from PIL import Image
 from pytest_mock import MockerFixture

 from documents.tests.utils import util_call_with_backoff
-from paperless_mail.parsers import MailDocumentParser
+from paperless.parsers.mail import MailDocumentParser


 def extract_text(pdf_path: Path) -> str:
@@ -159,7 +159,7 @@ class TestParserLive:
            - The returned thumbnail image file shall match the expected hash
        """
        mock_generate_pdf = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf",
        )
        mock_generate_pdf.return_value = simple_txt_email_pdf_file

@@ -216,10 +216,10 @@ class TestParserLive:
            - The merged PDF shall contain text from both source PDFs
        """
        mock_generate_pdf_from_html = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
        )
        mock_generate_pdf_from_mail = mocker.patch(
-            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
+            "paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
        )
        mock_generate_pdf_from_mail.return_value = merged_pdf_first
        mock_generate_pdf_from_html.return_value = merged_pdf_second
--- a/src/paperless/tests/parsers/test_remote_parser.py
+++ b/src/paperless/tests/parsers/test_remote_parser.py
@@ -0,0 +1,497 @@
+"""
+Tests for paperless.parsers.remote.RemoteDocumentParser.
+
+All tests use the context-manager protocol for parser lifecycle.
+
+Fixture layout
+--------------
+make_azure_mock  — factory (defined here; specific to this module)
+azure_client     — composes azure_settings + make_azure_mock + patch;
+                   use when a test needs the client to succeed
+failing_azure_client
+                 — composes azure_settings + patch with RuntimeError;
+                   use when a test needs the client to fail
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest.mock import Mock
+
+import pytest
+
+from paperless.parsers import ParserContext
+from paperless.parsers import ParserProtocol
+from paperless.parsers.remote import RemoteDocumentParser
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from pathlib import Path
+
+    from pytest_django.fixtures import SettingsWrapper
+    from pytest_mock import MockerFixture
+
+
+# ---------------------------------------------------------------------------
+# Module-local fixtures
+# ---------------------------------------------------------------------------
+
+_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
+_DEFAULT_TEXT = "Extracted text."
+
+
+@pytest.fixture()
+def make_azure_mock() -> Callable[..., Mock]:
+    """Return a factory that builds a mock Azure DocumentIntelligenceClient.
+
+    The factory's ``text`` argument is optional, so the fixture is annotated
+    as ``Callable[..., Mock]``; ``Callable[[str], Mock]`` would reject the
+    zero-argument call shown below.
+
+    Usage::
+
+        mock_client = make_azure_mock()            # default extracted text
+        mock_client = make_azure_mock("My text.")  # custom extracted text
+    """
+
+    def _factory(text: str = _DEFAULT_TEXT) -> Mock:
+        # Poller handed back by begin_analyze_document(): wait() returns None
+        # and result().content carries the extracted text.
+        mock_poller = Mock()
+        mock_poller.wait.return_value = None
+        mock_poller.details = {"operation_id": "fake-op-id"}
+        mock_poller.result.return_value.content = text
+
+        mock_client = Mock()
+        mock_client.begin_analyze_document.return_value = mock_poller
+        # A single fake PDF chunk is returned by get_analyze_result_pdf().
+        mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
+        return mock_client
+
+    return _factory
+
+
+@pytest.fixture()
+def azure_client(
+    azure_settings: SettingsWrapper,
+    make_azure_mock: Callable[..., Mock],
+    mocker: MockerFixture,
+) -> Mock:
+    """Patch the Azure DI client with a succeeding mock and return the instance.
+
+    Implicitly applies ``azure_settings`` so tests using this fixture do not
+    also need ``@pytest.mark.usefixtures("azure_settings")``.
+
+    ``make_azure_mock`` is typed ``Callable[..., Mock]`` because it is called
+    here without arguments (the factory's ``text`` parameter has a default).
+    """
+    mock_client = make_azure_mock()
+    # Constructing the patched client class yields the prepared mock.
+    mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
+    return mock_client
+
+
+@pytest.fixture()
+def failing_azure_client(
+    azure_settings: SettingsWrapper,
+    mocker: MockerFixture,
+) -> Mock:
+    """Patch the Azure DI client so that document analysis fails.
+
+    The returned mock raises ``RuntimeError("network failure")`` from
+    ``begin_analyze_document``; its other attributes remain plain mocks, so
+    tests can still assert on calls such as ``close()``.
+
+    Implicitly applies ``azure_settings``.
+    """
+    broken_client = Mock()
+    mocker.patch(_AZURE_CLIENT_TARGET, return_value=broken_client)
+    broken_client.begin_analyze_document.side_effect = RuntimeError(
+        "network failure",
+    )
+    return broken_client
+
+
+# ---------------------------------------------------------------------------
+# Protocol contract
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserProtocol:
+    """Contract checks: RemoteDocumentParser must behave as a ParserProtocol."""
+
+    def test_isinstance_satisfies_protocol(
+        self,
+        remote_parser: RemoteDocumentParser,
+    ) -> None:
+        """A parser instance passes ``isinstance`` against the protocol."""
+        satisfies_protocol = isinstance(remote_parser, ParserProtocol)
+        assert satisfies_protocol
+
+    def test_class_attributes_present(self) -> None:
+        """``name``, ``version``, ``author`` and ``url`` are non-empty strings."""
+        for attr_name in ("name", "version", "author", "url"):
+            attr_value = getattr(RemoteDocumentParser, attr_name)
+            assert isinstance(attr_value, str) and attr_value
+
+
+# ---------------------------------------------------------------------------
+# supported_mime_types
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserSupportedMimeTypes:
+    """supported_mime_types() reports the same full set whatever the config."""
+
+    def test_returns_dict(self) -> None:
+        """The return value is a ``dict``."""
+        supported = RemoteDocumentParser.supported_mime_types()
+        assert isinstance(supported, dict)
+
+    def test_includes_all_expected_types(self) -> None:
+        """The keys are exactly PDF plus the six listed image types."""
+        wanted = {
+            "application/pdf",
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+        }
+        supported = RemoteDocumentParser.supported_mime_types()
+        assert set(supported) == wanted
+
+    @pytest.mark.usefixtures("no_engine_settings")
+    def test_returns_full_set_when_not_configured(self) -> None:
+        """
+        GIVEN: No remote engine is configured
+        WHEN:  supported_mime_types() is called
+        THEN:  The full MIME type dict is still returned (score() handles activation)
+        """
+        supported = RemoteDocumentParser.supported_mime_types()
+        # Seven entries: the same count test_includes_all_expected_types lists.
+        assert len(supported) == 7
+
+
+# ---------------------------------------------------------------------------
+# score()
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserScore:
+    """score() carries the activation logic: 20 when active, else None."""
+
+    @pytest.mark.usefixtures("azure_settings")
+    @pytest.mark.parametrize(
+        "mime_type",
+        [
+            pytest.param("application/pdf", id="pdf"),
+            pytest.param("image/png", id="png"),
+            pytest.param("image/jpeg", id="jpeg"),
+            pytest.param("image/tiff", id="tiff"),
+            pytest.param("image/bmp", id="bmp"),
+            pytest.param("image/gif", id="gif"),
+            pytest.param("image/webp", id="webp"),
+        ],
+    )
+    def test_score_returns_20_when_configured(self, mime_type: str) -> None:
+        """Each listed MIME type scores 20 under ``azure_settings``."""
+        assert RemoteDocumentParser.score(mime_type, "doc.pdf") == 20
+
+    @pytest.mark.usefixtures("no_engine_settings")
+    @pytest.mark.parametrize(
+        "mime_type",
+        [
+            pytest.param("application/pdf", id="pdf"),
+            pytest.param("image/png", id="png"),
+            pytest.param("image/jpeg", id="jpeg"),
+        ],
+    )
+    def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
+        """Under ``no_engine_settings`` the score is ``None``."""
+        assert RemoteDocumentParser.score(mime_type, "doc.pdf") is None
+
+    def test_score_returns_none_when_api_key_missing(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        """Engine and endpoint set, API key ``None``: no score."""
+        settings.REMOTE_OCR_ENGINE = "azureai"
+        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
+        settings.REMOTE_OCR_API_KEY = None
+        pdf_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
+        assert pdf_score is None
+
+    def test_score_returns_none_when_endpoint_missing(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        """Engine and API key set, endpoint ``None``: no score."""
+        settings.REMOTE_OCR_ENGINE = "azureai"
+        settings.REMOTE_OCR_ENDPOINT = None
+        settings.REMOTE_OCR_API_KEY = "key"
+        pdf_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
+        assert pdf_score is None
+
+    @pytest.mark.usefixtures("azure_settings")
+    def test_score_returns_none_for_unsupported_mime_type(self) -> None:
+        """``text/plain`` is not scored even under ``azure_settings``."""
+        text_score = RemoteDocumentParser.score("text/plain", "doc.txt")
+        assert text_score is None
+
+    @pytest.mark.usefixtures("azure_settings")
+    def test_score_higher_than_tesseract_default(self) -> None:
+        """Remote parser (20) outranks the tesseract default (10) when configured."""
+        pdf_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
+        assert pdf_score is not None
+        assert pdf_score > 10
+
+
+# ---------------------------------------------------------------------------
+# Properties
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserProperties:
+    """Checks of the parser's two capability flags."""
+
+    def test_can_produce_archive_is_true(
+        self,
+        remote_parser: RemoteDocumentParser,
+    ) -> None:
+        """The ``can_produce_archive`` flag is exactly ``True``."""
+        produces_archive = remote_parser.can_produce_archive
+        assert produces_archive is True
+
+    def test_requires_pdf_rendition_is_false(
+        self,
+        remote_parser: RemoteDocumentParser,
+    ) -> None:
+        """The ``requires_pdf_rendition`` flag is exactly ``False``."""
+        needs_rendition = remote_parser.requires_pdf_rendition
+        assert needs_rendition is False
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserLifecycle:
+    """The parser's temp directory must not outlive its ``with`` block."""
+
+    def test_context_manager_cleans_up_tempdir(self) -> None:
+        """The temp dir exists inside the block and is gone after it."""
+        with RemoteDocumentParser() as parser:
+            scratch_dir = parser._tempdir
+            assert scratch_dir.exists()
+        assert not scratch_dir.exists()
+
+    def test_context_manager_cleans_up_after_exception(self) -> None:
+        """The temp dir is removed even when the block raises."""
+        scratch_dir: Path | None = None
+        # pytest.raises is entered first, so the parser exits before it does.
+        with pytest.raises(RuntimeError), RemoteDocumentParser() as parser:
+            scratch_dir = parser._tempdir
+            raise RuntimeError("boom")
+        assert scratch_dir is not None
+        assert not scratch_dir.exists()
+
+
+# ---------------------------------------------------------------------------
+# parse() — happy path
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserParse:
+    """parse() results against a succeeding mock client, or with no engine."""
+
+    def test_parse_returns_text_from_azure(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_client: Mock,
+    ) -> None:
+        """The text reported after parse() is the mock's default content."""
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        extracted = remote_parser.get_text()
+        assert extracted == _DEFAULT_TEXT
+
+    def test_parse_sets_archive_path(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_client: Mock,
+    ) -> None:
+        """After parse() the archive path is an existing ``.pdf`` file."""
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        archive_path = remote_parser.get_archive_path()
+        assert archive_path is not None
+        assert archive_path.exists()
+        assert archive_path.suffix == ".pdf"
+
+    def test_parse_closes_client_on_success(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_client: Mock,
+    ) -> None:
+        """A successful parse() calls ``close()`` on the client exactly once."""
+        default_context = ParserContext()
+        remote_parser.configure(default_context)
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        azure_client.close.assert_called_once()
+
+    @pytest.mark.usefixtures("no_engine_settings")
+    def test_parse_sets_empty_text_when_not_configured(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """Under ``no_engine_settings`` parse() gives empty text and no archive."""
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        extracted = remote_parser.get_text()
+        archive_path = remote_parser.get_archive_path()
+        assert extracted == ""
+        assert archive_path is None
+
+    def test_get_text_none_before_parse(
+        self,
+        remote_parser: RemoteDocumentParser,
+    ) -> None:
+        """Before any parse() call the text is ``None``."""
+        extracted = remote_parser.get_text()
+        assert extracted is None
+
+    def test_get_date_always_none(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_client: Mock,
+    ) -> None:
+        """No date is reported even after a successful parse()."""
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        detected_date = remote_parser.get_date()
+        assert detected_date is None
+
+
+# ---------------------------------------------------------------------------
+# parse() — Azure failure path
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserParseError:
+    """parse() behaviour when ``begin_analyze_document`` raises."""
+
+    def test_parse_returns_none_on_azure_error(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        failing_azure_client: Mock,
+    ) -> None:
+        """parse() completes without raising and the text stays ``None``."""
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        extracted = remote_parser.get_text()
+        assert extracted is None
+
+    def test_parse_closes_client_on_error(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        failing_azure_client: Mock,
+    ) -> None:
+        """The client's ``close()`` is still called exactly once on failure."""
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        failing_azure_client.close.assert_called_once()
+
+    def test_parse_logs_error_on_azure_failure(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        failing_azure_client: Mock,
+        mocker: MockerFixture,
+    ) -> None:
+        """Exactly one error is logged and it names the Azure failure."""
+        patched_logger = mocker.patch("paperless.parsers.remote.logger")
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        patched_logger.error.assert_called_once()
+        # First positional argument of the single error() call is the message.
+        logged_message = patched_logger.error.call_args.args[0]
+        assert "Azure AI Vision parsing failed" in logged_message
+
+
+# ---------------------------------------------------------------------------
+# get_page_count()
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserPageCount:
+    """get_page_count() for a PDF, a non-PDF MIME type and a corrupt PDF."""
+
+    def test_page_count_for_pdf(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """The sample PDF yields an integer page count of at least one."""
+        pages = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
+        assert isinstance(pages, int)
+        assert pages >= 1
+
+    def test_page_count_returns_none_for_image_mime(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """With MIME type ``image/png`` the count is ``None``."""
+        pages = remote_parser.get_page_count(sample_pdf_file, "image/png")
+        assert pages is None
+
+    def test_page_count_returns_none_for_invalid_pdf(
+        self,
+        remote_parser: RemoteDocumentParser,
+        tmp_path: Path,
+    ) -> None:
+        """Bytes that are not a PDF give ``None`` rather than an exception."""
+        corrupt_pdf = tmp_path / "bad.pdf"
+        corrupt_pdf.write_bytes(b"not a pdf at all")
+        pages = remote_parser.get_page_count(corrupt_pdf, "application/pdf")
+        assert pages is None
+
+
+# ---------------------------------------------------------------------------
+# extract_metadata()
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserMetadata:
+    """extract_metadata() for PDF input, non-PDF MIME types and corrupt PDFs."""
+
+    def test_extract_metadata_non_pdf_returns_empty(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """With MIME type ``image/png`` the result is an empty list."""
+        metadata = remote_parser.extract_metadata(sample_pdf_file, "image/png")
+        assert metadata == []
+
+    def test_extract_metadata_pdf_returns_list(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """For a PDF the result is a ``list``."""
+        metadata = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
+        assert isinstance(metadata, list)
+
+    def test_extract_metadata_pdf_entries_have_required_keys(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """Every entry has the four expected keys and a string ``value``."""
+        required_keys = ("namespace", "prefix", "key", "value")
+        metadata = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
+        # NOTE(review): passes vacuously if the sample yields no entries.
+        for entry in metadata:
+            for required in required_keys:
+                assert required in entry
+            assert isinstance(entry["value"], str)
+
+    def test_extract_metadata_does_not_raise_on_invalid_pdf(
+        self,
+        remote_parser: RemoteDocumentParser,
+        tmp_path: Path,
+    ) -> None:
+        """Bytes that are not a PDF give an empty list, not an exception."""
+        corrupt_pdf = tmp_path / "bad.pdf"
+        corrupt_pdf.write_bytes(b"not a pdf at all")
+        metadata = remote_parser.extract_metadata(corrupt_pdf, "application/pdf")
+        assert metadata == []
+
+
+# ---------------------------------------------------------------------------
+# Registry integration
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserRegistry:
+    """How the parser registry exposes and selects RemoteDocumentParser."""
+
+    def test_registered_in_defaults(self) -> None:
+        """register_defaults() adds the remote parser to ``_builtins``."""
+        from paperless.parsers.registry import ParserRegistry
+
+        fresh_registry = ParserRegistry()
+        fresh_registry.register_defaults()
+
+        assert RemoteDocumentParser in fresh_registry._builtins
+
+    @pytest.mark.usefixtures("azure_settings")
+    def test_get_parser_returns_remote_when_configured(self) -> None:
+        """Under ``azure_settings`` the remote parser is chosen for a PDF."""
+        from paperless.parsers.registry import get_parser_registry
+
+        chosen = get_parser_registry().get_parser_for_file(
+            "application/pdf",
+            "doc.pdf",
+        )
+
+        assert chosen is RemoteDocumentParser
+
+    @pytest.mark.usefixtures("no_engine_settings")
+    def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
+        self,
+    ) -> None:
+        """With remote off and a truly unsupported MIME type, registry returns None."""
+        from paperless.parsers.registry import ParserRegistry
+
+        fresh_registry = ParserRegistry()
+        fresh_registry.register_defaults()
+        chosen = fresh_registry.get_parser_for_file(
+            "application/x-unknown-format",
+            "doc.xyz",
+        )
+
+        assert chosen is None
--- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py
+++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py
@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
 from paperless.models import ColorConvertChoices
 from paperless.models import ModeChoices
 from paperless.models import OutputTypeChoices
-from paperless_tesseract.parsers import RasterisedDocumentParser
+from paperless.parsers.tesseract import RasterisedDocumentParser


 class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
--- a/src/paperless/tests/parsers/test_tesseract_parser.py
+++ b/src/paperless/tests/parsers/test_tesseract_parser.py
--- a/src/paperless/tests/parsers/test_text_parser.py
+++ b/src/paperless/tests/parsers/test_text_parser.py
@@ -12,6 +12,7 @@ from pathlib import Path

 import pytest

+from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.text import TextDocumentParser

@@ -93,6 +94,7 @@ class TestTextParserParse:
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
+        text_parser.configure(ParserContext())
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
@@ -102,6 +104,7 @@ class TestTextParserParse:
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
+        text_parser.configure(ParserContext())
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_archive_path() is None
@@ -111,6 +114,7 @@ class TestTextParserParse:
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
+        text_parser.configure(ParserContext())
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_date() is None
@@ -129,6 +133,7 @@ class TestTextParserParse:
            - Parsing succeeds
            - Invalid bytes are replaced with the Unicode replacement character
        """
+        text_parser.configure(ParserContext())
        text_parser.parse(malformed_txt_file, "text/plain")

        assert text_parser.get_text() == "Pantothens\ufffdure\n"
@@ -251,6 +256,9 @@ class TestTextParserRegistry:
        from paperless.parsers.registry import get_parser_registry

        registry = get_parser_registry()
-        parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
+        parser_cls = registry.get_parser_for_file(
+            "application/x-unknown-format",
+            "doc.xyz",
+        )

        assert parser_cls is None
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -9,6 +9,7 @@ from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock

 from documents.parsers import ParseError
+from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.tika import TikaDocumentParser

@@ -60,6 +61,29 @@ class TestTikaParserRegistryInterface:
    def test_requires_pdf_rendition_is_true(self) -> None:
        assert TikaDocumentParser().requires_pdf_rendition is True

+    def test_get_page_count_returns_none_without_archive(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ) -> None:
+        """With no archive path assigned, the page count is ``None``."""
+        page_count = tika_parser.get_page_count(
+            sample_odt_file,
+            "application/vnd.oasis.opendocument.text",
+        )
+        assert page_count is None
+
+    def test_get_page_count_returns_int_with_pdf_archive(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        """Once ``_archive_path`` points at a PDF, a positive int is returned."""
+        # Assign the private attribute directly instead of running parse().
+        tika_parser._archive_path = sample_pdf_file
+        page_count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
+        assert isinstance(page_count, int)
+        assert page_count > 0
+

@pytest.mark.django_db()
 class TestTikaParser:
@@ -83,6 +107,7 @@ class TestTikaParser:
        # Pretend convert to PDF response
        httpx_mock.add_response(content=b"PDF document")

+        tika_parser.configure(ParserContext())
        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")

        assert tika_parser.get_text() == "the content"
--- a/src/paperless/tests/samples/mail/broken.eml
+++ b/src/paperless/tests/samples/mail/broken.eml
--- a/src/paperless/tests/samples/mail/first.pdf
+++ b/src/paperless/tests/samples/mail/first.pdf
--- a/src/paperless/tests/samples/mail/html.eml
+++ b/src/paperless/tests/samples/mail/html.eml
--- a/src/paperless/tests/samples/mail/html.eml.html
+++ b/src/paperless/tests/samples/mail/html.eml.html
--- a/src/paperless/tests/samples/mail/html.eml.pdf
+++ b/src/paperless/tests/samples/mail/html.eml.pdf
--- a/src/paperless/tests/samples/mail/html.eml.pdf.webp
+++ b/src/paperless/tests/samples/mail/html.eml.pdf.webp
--- a/src/paperless/tests/samples/mail/sample.html
+++ b/src/paperless/tests/samples/mail/sample.html
--- a/src/paperless/tests/samples/mail/sample.html.pdf
+++ b/src/paperless/tests/samples/mail/sample.html.pdf
--- a/src/paperless/tests/samples/mail/sample.html.pdf.webp
+++ b/src/paperless/tests/samples/mail/sample.html.pdf.webp
--- a/src/paperless/tests/samples/mail/sample.png
+++ b/src/paperless/tests/samples/mail/sample.png
--- a/src/paperless/tests/samples/mail/second.pdf
+++ b/src/paperless/tests/samples/mail/second.pdf
--- a/src/paperless/tests/samples/mail/simple_text.eml
+++ b/src/paperless/tests/samples/mail/simple_text.eml
--- a/src/paperless/tests/samples/mail/simple_text.eml.pdf
+++ b/src/paperless/tests/samples/mail/simple_text.eml.pdf
--- a/src/paperless/tests/samples/mail/simple_text.eml.pdf.webp
+++ b/src/paperless/tests/samples/mail/simple_text.eml.pdf.webp
--- a/src/paperless/tests/samples/remote/simple-digital.pdf
+++ b/src/paperless/tests/samples/remote/simple-digital.pdf
--- a/src/paperless/tests/samples/tesseract/document.webp
+++ b/src/paperless/tests/samples/tesseract/document.webp
--- a/src/paperless/tests/samples/tesseract/encrypted.pdf
+++ b/src/paperless/tests/samples/tesseract/encrypted.pdf
--- a/src/paperless/tests/samples/tesseract/multi-page-digital.pdf
+++ b/src/paperless/tests/samples/tesseract/multi-page-digital.pdf
--- a/src/paperless/tests/samples/tesseract/multi-page-images-alpha-rgb.tiff
+++ b/src/paperless/tests/samples/tesseract/multi-page-images-alpha-rgb.tiff
--- a/src/paperless/tests/samples/tesseract/multi-page-images-alpha.tiff
+++ b/src/paperless/tests/samples/tesseract/multi-page-images-alpha.tiff
--- a/src/paperless/tests/samples/tesseract/multi-page-images.pdf
+++ b/src/paperless/tests/samples/tesseract/multi-page-images.pdf
--- a/src/paperless/tests/samples/tesseract/multi-page-images.tiff
+++ b/src/paperless/tests/samples/tesseract/multi-page-images.tiff
--- a/src/paperless/tests/samples/tesseract/multi-page-mixed.pdf
+++ b/src/paperless/tests/samples/tesseract/multi-page-mixed.pdf
--- a/src/paperless/tests/samples/tesseract/no-text-alpha.png
+++ b/src/paperless/tests/samples/tesseract/no-text-alpha.png
--- a/src/paperless/tests/samples/tesseract/rotated.pdf
+++ b/src/paperless/tests/samples/tesseract/rotated.pdf
--- a/src/paperless/tests/samples/tesseract/rtl-test.pdf
+++ b/src/paperless/tests/samples/tesseract/rtl-test.pdf
--- a/src/paperless/tests/samples/tesseract/signed.pdf
+++ b/src/paperless/tests/samples/tesseract/signed.pdf
--- a/src/paperless/tests/samples/tesseract/simple-alpha.png
+++ b/src/paperless/tests/samples/tesseract/simple-alpha.png
--- a/src/paperless/tests/samples/tesseract/simple-digital.pdf
+++ b/src/paperless/tests/samples/tesseract/simple-digital.pdf
--- a/src/paperless/tests/samples/tesseract/simple-no-dpi.png
+++ b/src/paperless/tests/samples/tesseract/simple-no-dpi.png
--- a/src/paperless/tests/samples/tesseract/simple.bmp
+++ b/src/paperless/tests/samples/tesseract/simple.bmp
--- a/src/paperless/tests/samples/tesseract/simple.gif
+++ b/src/paperless/tests/samples/tesseract/simple.gif
--- a/src/paperless/tests/samples/tesseract/simple.heic
+++ b/src/paperless/tests/samples/tesseract/simple.heic
--- a/src/paperless/tests/samples/tesseract/simple.jpg
+++ b/src/paperless/tests/samples/tesseract/simple.jpg
--- a/src/paperless/tests/samples/tesseract/simple.png
+++ b/src/paperless/tests/samples/tesseract/simple.png
--- a/src/paperless/tests/samples/tesseract/simple.tif
+++ b/src/paperless/tests/samples/tesseract/simple.tif
--- a/src/paperless/tests/samples/tesseract/single-page-mixed.pdf
+++ b/src/paperless/tests/samples/tesseract/single-page-mixed.pdf
--- a/src/paperless/tests/samples/tesseract/with-form.pdf
+++ b/src/paperless/tests/samples/tesseract/with-form.pdf
--- a/src/paperless/tests/test_checks.py
+++ b/src/paperless/tests/test_checks.py
@@ -5,6 +5,7 @@ from pathlib import Path
 from unittest import mock

 import pytest
+from django.core.checks import ERROR
 from django.core.checks import Error
 from django.core.checks import Warning
 from pytest_django.fixtures import SettingsWrapper
@@ -12,7 +13,9 @@ from pytest_mock import MockerFixture

 from paperless.checks import audit_log_check
 from paperless.checks import binaries_check
+from paperless.checks import check_default_language_available
 from paperless.checks import check_deprecated_db_settings
+from paperless.checks import check_remote_parser_configured
 from paperless.checks import check_v3_minimum_upgrade_version
 from paperless.checks import debug_mode_check
 from paperless.checks import paths_check
@@ -626,3 +629,116 @@ class TestV3MinimumUpgradeVersionCheck:
        conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
        mocker.patch.dict("paperless.checks.connections", {"default": conn})
        assert check_v3_minimum_upgrade_version(None) == []
+
+
+class TestRemoteParserChecks:
+    def test_no_engine(self, settings: SettingsWrapper) -> None:
+        settings.REMOTE_OCR_ENGINE = None
+        msgs = check_remote_parser_configured(None)
+
+        assert len(msgs) == 0
+
+    def test_azure_no_endpoint(self, settings: SettingsWrapper) -> None:
+
+        settings.REMOTE_OCR_ENGINE = "azureai"
+        settings.REMOTE_OCR_API_KEY = "somekey"
+        settings.REMOTE_OCR_ENDPOINT = None
+
+        msgs = check_remote_parser_configured(None)
+
+        assert len(msgs) == 1
+
+        msg = msgs[0]
+
+        assert (
+            "Azure AI remote parser requires endpoint and API key to be configured."
+            in msg.msg
+        )
+
+
+class TestTesseractChecks:
+    def test_default_language(self) -> None:
+        check_default_language_available(None)
+
+    def test_no_language(self, settings: SettingsWrapper) -> None:
+
+        settings.OCR_LANGUAGE = ""
+
+        msgs = check_default_language_available(None)
+
+        assert len(msgs) == 1
+        msg = msgs[0]
+
+        assert (
+            "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" in msg.msg
+        )
+
+    def test_invalid_language(
+        self,
+        settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+
+        settings.OCR_LANGUAGE = "ita"
+
+        tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
+        tesser_lang_mock.return_value = ["deu", "eng"]
+
+        msgs = check_default_language_available(None)
+
+        assert len(msgs) == 1
+        msg = msgs[0]
+
+        assert msg.level == ERROR
+        assert "The selected ocr language ita is not installed" in msg.msg
+
+    def test_multi_part_language(
+        self,
+        settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        """
+        GIVEN:
+            - An OCR language which is multi part (e.g. chi_sim)
+            - The language is correctly formatted
+        WHEN:
+            - Installed packages are checked
+        THEN:
+            - No errors are reported
+        """
+
+        settings.OCR_LANGUAGE = "chi_sim"
+
+        tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
+        tesser_lang_mock.return_value = ["chi_sim", "eng"]
+
+        msgs = check_default_language_available(None)
+
+        assert len(msgs) == 0
+
+    def test_multi_part_language_bad_format(
+        self,
+        settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        """
+        GIVEN:
+            - An OCR language which is multi part (e.g. chi-sim)
+            - The language is NOT correctly formatted
+        WHEN:
+            - Installed packages are checked
+        THEN:
+            - An error is reported that the language is not installed
+        """
+        settings.OCR_LANGUAGE = "chi-sim"
+
+        tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
+        tesser_lang_mock.return_value = ["chi_sim", "eng"]
+
+        msgs = check_default_language_available(None)
+
+        assert len(msgs) == 1
+        msg = msgs[0]
+
+        assert msg.level == ERROR
+        assert "The selected ocr language chi-sim is not installed" in msg.msg
--- a/src/paperless/tests/test_registry.py
+++ b/src/paperless/tests/test_registry.py
@@ -18,6 +18,7 @@ from unittest.mock import patch

 import pytest

+from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.registry import ParserRegistry
 from paperless.parsers.registry import get_parser_registry
@@ -103,6 +104,11 @@ def dummy_parser_cls() -> type:
        ) -> list:
            return []

+        def configure(self, context: ParserContext) -> None:
+            """
+            Required to exist, but doesn't need to do anything
+            """
+
        def __enter__(self) -> Self:
            return self

@@ -144,6 +150,7 @@ class TestParserProtocol:
    @pytest.mark.parametrize(
        "missing_method",
        [
+            pytest.param("configure", id="missing-configure"),
            pytest.param("parse", id="missing-parse"),
            pytest.param("get_text", id="missing-get_text"),
            pytest.param("get_thumbnail", id="missing-get_thumbnail"),
--- a/src/paperless_ai/tests/conftest.py
+++ b/src/paperless_ai/tests/conftest.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+
+import pytest
+from pytest_django.fixtures import SettingsWrapper
+
+
+@pytest.fixture
+def temp_llm_index_dir(tmp_path: Path, settings: SettingsWrapper):
+    settings.LLM_INDEX_DIR = tmp_path
+    return tmp_path
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -13,14 +13,6 @@ from documents.models import PaperlessTask
 from paperless_ai import indexing


-@pytest.fixture
-def temp_llm_index_dir(tmp_path):
-    original_dir = indexing.settings.LLM_INDEX_DIR
-    indexing.settings.LLM_INDEX_DIR = tmp_path
-    yield tmp_path
-    indexing.settings.LLM_INDEX_DIR = original_dir
-
-
@pytest.fixture
 def real_document(db):
    return Document.objects.create(
--- a/src/paperless_ai/tests/test_embedding.py
+++ b/src/paperless_ai/tests/test_embedding.py
@@ -3,7 +3,6 @@ from unittest.mock import MagicMock
 from unittest.mock import patch

 import pytest
-from django.conf import settings

 from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
@@ -19,14 +18,6 @@ def mock_ai_config():
        yield MockAIConfig


-@pytest.fixture
-def temp_llm_index_dir(tmp_path):
-    original_dir = settings.LLM_INDEX_DIR
-    settings.LLM_INDEX_DIR = tmp_path
-    yield tmp_path
-    settings.LLM_INDEX_DIR = original_dir
-
-
@pytest.fixture
 def mock_document():
    doc = MagicMock(spec=Document)
--- a/src/paperless_mail/apps.py
+++ b/src/paperless_mail/apps.py
@@ -1,18 +1,8 @@
 from django.apps import AppConfig
-from django.conf import settings
 from django.utils.translation import gettext_lazy as _

-from paperless_mail.signals import mail_consumer_declaration
-

 class PaperlessMailConfig(AppConfig):
    name = "paperless_mail"

    verbose_name = _("Paperless mail")
-
-    def ready(self) -> None:
-        from documents.signals import document_consumer_declaration
-
-        if settings.TIKA_ENABLED:
-            document_consumer_declaration.connect(mail_consumer_declaration)
-        AppConfig.ready(self)
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -1,481 +0,0 @@
-import re
-from html import escape
-from pathlib import Path
-
-from bleach import clean
-from bleach import linkify
-from django.conf import settings
-from django.utils import timezone
-from django.utils.timezone import is_naive
-from django.utils.timezone import make_aware
-from gotenberg_client import GotenbergClient
-from gotenberg_client.constants import A4
-from gotenberg_client.options import Measurement
-from gotenberg_client.options import MeasurementUnitType
-from gotenberg_client.options import PageMarginsType
-from gotenberg_client.options import PdfAFormat
-from humanize import naturalsize
-from imap_tools import MailAttachment
-from imap_tools import MailMessage
-from tika_client import TikaClient
-
-from documents.parsers import DocumentParser
-from documents.parsers import ParseError
-from documents.parsers import make_thumbnail_from_pdf
-from paperless.models import OutputTypeChoices
-from paperless_mail.models import MailRule
-
-
-class MailDocumentParser(DocumentParser):
-    """
-    This parser uses imap_tools to parse .eml files, generates pdf using
-    Gotenberg and sends the html part to a Tika server for text extraction.
-    """
-
-    logging_name = "paperless.parsing.mail"
-
-    def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
-        """
-        Converts our requested PDF/A output into the Gotenberg API
-        format
-        """
-        if settings.OCR_OUTPUT_TYPE in {
-            OutputTypeChoices.PDF_A,
-            OutputTypeChoices.PDF_A2,
-        }:
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:  # pragma: no cover
-            self.log.warning(
-                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
-            )
-            return PdfAFormat.A2b
-        elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:  # pragma: no cover
-            return PdfAFormat.A3b
-        return None
-
-    def get_thumbnail(
-        self,
-        document_path: Path,
-        mime_type: str,
-        file_name=None,
-    ) -> Path:
-        if not self.archive_path:
-            self.archive_path = self.generate_pdf(
-                self.parse_file_to_message(document_path),
-            )
-
-        return make_thumbnail_from_pdf(
-            self.archive_path,
-            self.tempdir,
-            self.logging_group,
-        )
-
-    def extract_metadata(self, document_path: Path, mime_type: str):
-        result = []
-
-        try:
-            mail = self.parse_file_to_message(document_path)
-        except ParseError as e:
-            self.log.warning(
-                f"Error while fetching document metadata for {document_path}: {e}",
-            )
-            return result
-
-        for key, value in mail.headers.items():
-            value = ", ".join(i for i in value)
-            try:
-                value.encode("utf-8")
-            except UnicodeEncodeError as e:  # pragma: no cover
-                self.log.debug(f"Skipping header {key}: {e}")
-                continue
-
-            result.append(
-                {
-                    "namespace": "",
-                    "prefix": "header",
-                    "key": key,
-                    "value": value,
-                },
-            )
-
-        result.append(
-            {
-                "namespace": "",
-                "prefix": "",
-                "key": "attachments",
-                "value": ", ".join(
-                    f"{attachment.filename}"
-                    f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
-                    for attachment in mail.attachments
-                ),
-            },
-        )
-
-        result.append(
-            {
-                "namespace": "",
-                "prefix": "",
-                "key": "date",
-                "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
-            },
-        )
-
-        result.sort(key=lambda item: (item["prefix"], item["key"]))
-        return result
-
-    def parse(
-        self,
-        document_path: Path,
-        mime_type: str,
-        file_name=None,
-        mailrule_id: int | None = None,
-    ) -> None:
-        """
-        Parses the given .eml into formatted text, based on the decoded email.
-
-        """
-
-        def strip_text(text: str):
-            """
-            Reduces the spacing of the given text string
-            """
-            text = re.sub(r"\s+", " ", text)
-            text = re.sub(r"(\n *)+", "\n", text)
-            return text.strip()
-
-        def build_formatted_text(mail_message: MailMessage) -> str:
-            """
-            Constructs a formatted string, based on the given email.  Basically tries
-            to get most of the email content, included front matter, into a nice string
-            """
-            fmt_text = f"Subject: {mail_message.subject}\n\n"
-            fmt_text += f"From: {mail_message.from_values.full}\n\n"
-            to_list = [address.full for address in mail_message.to_values]
-            fmt_text += f"To: {', '.join(to_list)}\n\n"
-            if mail_message.cc_values:
-                fmt_text += (
-                    f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
-                )
-            if mail_message.bcc_values:
-                fmt_text += (
-                    f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
-                )
-            if mail_message.attachments:
-                att = []
-                for a in mail.attachments:
-                    attachment_size = naturalsize(a.size, binary=True, format="%.2f")
-                    att.append(
-                        f"{a.filename} ({attachment_size})",
-                    )
-                fmt_text += f"Attachments: {', '.join(att)}\n\n"
-
-            if mail.html:
-                fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
-
-            fmt_text += f"\n\n{strip_text(mail.text)}"
-
-            return fmt_text
-
-        self.log.debug(f"Parsing file {document_path.name} into an email")
-        mail = self.parse_file_to_message(document_path)
-
-        self.log.debug("Building formatted text from email")
-        self.text = build_formatted_text(mail)
-
-        if is_naive(mail.date):
-            self.date = make_aware(mail.date)
-        else:
-            self.date = mail.date
-
-        self.log.debug("Creating a PDF from the email")
-        if mailrule_id:
-            rule = MailRule.objects.get(pk=mailrule_id)
-            self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
-        else:
-            self.archive_path = self.generate_pdf(mail)
-
-    @staticmethod
-    def parse_file_to_message(filepath: Path) -> MailMessage:
-        """
-        Parses the given .eml file into a MailMessage object
-        """
-        try:
-            with filepath.open("rb") as eml:
-                parsed = MailMessage.from_bytes(eml.read())
-                if parsed.from_values is None:
-                    raise ParseError(
-                        f"Could not parse {filepath}: Missing 'from'",
-                    )
-        except Exception as err:
-            raise ParseError(
-                f"Could not parse {filepath}: {err}",
-            ) from err
-
-        return parsed
-
-    def tika_parse(self, html: str):
-        self.log.info("Sending content to Tika server")
-
-        try:
-            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
-                parsed = client.tika.as_text.from_buffer(html, "text/html")
-
-                if parsed.content is not None:
-                    return parsed.content.strip()
-                return ""
-        except Exception as err:
-            raise ParseError(
-                f"Could not parse content with tika server at "
-                f"{settings.TIKA_ENDPOINT}: {err}",
-            ) from err
-
-    def generate_pdf(
-        self,
-        mail_message: MailMessage,
-        pdf_layout: MailRule.PdfLayout | None = None,
-    ) -> Path:
-        archive_path = Path(self.tempdir) / "merged.pdf"
-
-        mail_pdf_file = self.generate_pdf_from_mail(mail_message)
-
-        pdf_layout = (
-            pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
-        )  # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
-
-        # If no HTML content, create the PDF from the message
-        # Otherwise, create 2 PDFs and merge them with Gotenberg
-        if not mail_message.html:
-            archive_path.write_bytes(mail_pdf_file.read_bytes())
-        else:
-            pdf_of_html_content = self.generate_pdf_from_html(
-                mail_message.html,
-                mail_message.attachments,
-            )
-
-            self.log.debug("Merging email text and HTML content into single PDF")
-
-            with (
-                GotenbergClient(
-                    host=settings.TIKA_GOTENBERG_ENDPOINT,
-                    timeout=settings.CELERY_TASK_TIME_LIMIT,
-                ) as client,
-                client.merge.merge() as route,
-            ):
-                # Configure requested PDF/A formatting, if any
-                pdf_a_format = self._settings_to_gotenberg_pdfa()
-                if pdf_a_format is not None:
-                    route.pdf_format(pdf_a_format)
-
-                match pdf_layout:
-                    case MailRule.PdfLayout.HTML_TEXT:
-                        route.merge([pdf_of_html_content, mail_pdf_file])
-                    case MailRule.PdfLayout.HTML_ONLY:
-                        route.merge([pdf_of_html_content])
-                    case MailRule.PdfLayout.TEXT_ONLY:
-                        route.merge([mail_pdf_file])
-                    case MailRule.PdfLayout.TEXT_HTML | _:
-                        route.merge([mail_pdf_file, pdf_of_html_content])
-
-                try:
-                    response = route.run()
-                    archive_path.write_bytes(response.content)
-                except Exception as err:
-                    raise ParseError(
-                        f"Error while merging email HTML into PDF: {err}",
-                    ) from err
-
-        return archive_path
-
-    def mail_to_html(self, mail: MailMessage) -> Path:
-        """
-        Converts the given email into an HTML file, formatted
-        based on the given template
-        """
-
-        def clean_html(text: str) -> str:
-            """
-            Attempts to clean, escape and linkify the given HTML string
-            """
-            if isinstance(text, list):
-                text = "\n".join([str(e) for e in text])
-            if not isinstance(text, str):
-                text = str(text)
-            text = escape(text)
-            text = clean(text)
-            text = linkify(text, parse_email=True)
-            text = text.replace("\n", "<br>")
-            return text
-
-        data = {}
-
-        data["subject"] = clean_html(mail.subject)
-        if data["subject"]:
-            data["subject_label"] = "Subject"
-        data["from"] = clean_html(mail.from_values.full)
-        if data["from"]:
-            data["from_label"] = "From"
-        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
-        if data["to"]:
-            data["to_label"] = "To"
-        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
-        if data["cc"]:
-            data["cc_label"] = "CC"
-        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
-        if data["bcc"]:
-            data["bcc_label"] = "BCC"
-
-        att = []
-        for a in mail.attachments:
-            att.append(
-                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
-            )
-        data["attachments"] = clean_html(", ".join(att))
-        if data["attachments"]:
-            data["attachments_label"] = "Attachments"
-
-        data["date"] = clean_html(
-            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
-        )
-        data["content"] = clean_html(mail.text.strip())
-
-        from django.template.loader import render_to_string
-
-        html_file = Path(self.tempdir) / "email_as_html.html"
-        html_file.write_text(render_to_string("email_msg_template.html", context=data))
-
-        return html_file
-
-    def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
-        """
-        Creates a PDF based on the given email, using the email's values in a
-        an HTML template
-        """
-        self.log.info("Converting mail to PDF")
-
-        css_file = Path(__file__).parent / "templates" / "output.css"
-        email_html_file = self.mail_to_html(mail)
-
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.chromium.html_to_pdf() as route,
-        ):
-            # Configure requested PDF/A formatting, if any
-            pdf_a_format = self._settings_to_gotenberg_pdfa()
-            if pdf_a_format is not None:
-                route.pdf_format(pdf_a_format)
-
-            try:
-                response = (
-                    route.index(email_html_file)
-                    .resource(css_file)
-                    .margins(
-                        PageMarginsType(
-                            top=Measurement(0.1, MeasurementUnitType.Inches),
-                            bottom=Measurement(0.1, MeasurementUnitType.Inches),
-                            left=Measurement(0.1, MeasurementUnitType.Inches),
-                            right=Measurement(0.1, MeasurementUnitType.Inches),
-                        ),
-                    )
-                    .size(A4)
-                    .scale(1.0)
-                    .run()
-                )
-            except Exception as err:
-                raise ParseError(
-                    f"Error while converting email to PDF: {err}",
-                ) from err
-
-        email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
-        email_as_pdf_file.write_bytes(response.content)
-
-        return email_as_pdf_file
-
-    def generate_pdf_from_html(
-        self,
-        orig_html: str,
-        attachments: list[MailAttachment],
-    ) -> Path:
-        """
-        Generates a PDF file based on the HTML and attachments of the email
-        """
-
-        def clean_html_script(text: str):
-            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
-            text = compiled_open.sub("<div hidden ", text)
-
-            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
-            text = compiled_close.sub("</div", text)
-            return text
-
-        self.log.info("Converting message html to PDF")
-
-        tempdir = Path(self.tempdir)
-
-        html_clean = clean_html_script(orig_html)
-        html_clean_file = tempdir / "index.html"
-        html_clean_file.write_text(html_clean)
-
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.chromium.html_to_pdf() as route,
-        ):
-            # Configure requested PDF/A formatting, if any
-            pdf_a_format = self._settings_to_gotenberg_pdfa()
-            if pdf_a_format is not None:
-                route.pdf_format(pdf_a_format)
-
-            # Add attachments as resources, cleaning the filename and replacing
-            # it in the index file for inclusion
-            for attachment in attachments:
-                # Clean the attachment name to be valid
-                name_cid = f"cid:{attachment.content_id}"
-                name_clean = "".join(e for e in name_cid if e.isalnum())
-
-                # Write attachment payload to a temp file
-                temp_file = tempdir / name_clean
-                temp_file.write_bytes(attachment.payload)
-
-                route.resource(temp_file)
-
-                # Replace as needed the name with the clean name
-                html_clean = html_clean.replace(name_cid, name_clean)
-
-            # Now store the cleaned up HTML version
-            html_clean_file = tempdir / "index.html"
-            html_clean_file.write_text(html_clean)
-            # This is our index file, the main page basically
-            route.index(html_clean_file)
-
-            # Set page size, margins
-            route.margins(
-                PageMarginsType(
-                    top=Measurement(0.1, MeasurementUnitType.Inches),
-                    bottom=Measurement(0.1, MeasurementUnitType.Inches),
-                    left=Measurement(0.1, MeasurementUnitType.Inches),
-                    right=Measurement(0.1, MeasurementUnitType.Inches),
-                ),
-            ).size(A4).scale(1.0)
-
-            try:
-                response = route.run()
-
-            except Exception as err:
-                raise ParseError(
-                    f"Error while converting document to PDF: {err}",
-                ) from err
-
-        html_pdf = tempdir / "html.pdf"
-        html_pdf.write_bytes(response.content)
-        return html_pdf
-
-    def get_settings(self) -> None:
-        """
-        This parser does not implement additional settings yet
-        """
-        return None
--- a/src/paperless_mail/signals.py
+++ b/src/paperless_mail/signals.py
@@ -1,14 +0,0 @@
-def get_parser(*args, **kwargs):
-    from paperless_mail.parsers import MailDocumentParser
-
-    return MailDocumentParser(*args, **kwargs)
-
-
-def mail_consumer_declaration(sender, **kwargs):
-    return {
-        "parser": get_parser,
-        "weight": 20,
-        "mime_types": {
-            "message/rfc822": ".eml",
-        },
-    }
--- a/src/paperless_mail/tests/conftest.py
+++ b/src/paperless_mail/tests/conftest.py
@@ -1,71 +1,9 @@
 from collections.abc import Generator
-from pathlib import Path

 import pytest

 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.models import MailAccount
-from paperless_mail.parsers import MailDocumentParser
-
-
-@pytest.fixture(scope="session")
-def sample_dir() -> Path:
-    return (Path(__file__).parent / Path("samples")).resolve()
-
-
-@pytest.fixture(scope="session")
-def broken_email_file(sample_dir: Path) -> Path:
-    return sample_dir / "broken.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_file(sample_dir: Path) -> Path:
-    return sample_dir / "simple_text.eml"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
-    return sample_dir / "simple_text.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
-    return sample_dir / "simple_text.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_file(sample_dir: Path) -> Path:
-    return sample_dir / "html.eml"
-
-
-@pytest.fixture(scope="session")
-def html_email_pdf_file(sample_dir: Path) -> Path:
-    return sample_dir / "html.eml.pdf"
-
-
-@pytest.fixture(scope="session")
-def html_email_thumbnail_file(sample_dir: Path) -> Path:
-    return sample_dir / "html.eml.pdf.webp"
-
-
-@pytest.fixture(scope="session")
-def html_email_html_file(sample_dir: Path) -> Path:
-    return sample_dir / "html.eml.html"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_first(sample_dir: Path) -> Path:
-    return sample_dir / "first.pdf"
-
-
-@pytest.fixture(scope="session")
-def merged_pdf_second(sample_dir: Path) -> Path:
-    return sample_dir / "second.pdf"
-
-
-@pytest.fixture()
-def mail_parser() -> MailDocumentParser:
-    return MailDocumentParser(logging_group=None)


@pytest.fixture()
@@ -89,11 +27,3 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
@pytest.fixture()
 def mail_account_handler() -> MailAccountHandler:
    return MailAccountHandler()
-
-
-@pytest.fixture(scope="session")
-def nginx_base_url() -> Generator[str, None, None]:
-    """
-    The base URL for the nginx HTTP server we expect to be alive
-    """
-    yield "http://localhost:8080"
--- a/src/paperless_mail/tests/test_mail_oauth.py
+++ b/src/paperless_mail/tests/test_mail_oauth.py
@@ -1,7 +1,6 @@
 from datetime import timedelta
 from unittest import mock

-from django.conf import settings
 from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
 from django.test import TestCase
@@ -16,6 +15,13 @@ from paperless_mail.models import MailAccount
 from paperless_mail.oauth import PaperlessMailOAuth2Manager


+@override_settings(
+    OAUTH_CALLBACK_BASE_URL="http://localhost:8000",
+    GMAIL_OAUTH_CLIENT_ID="test_gmail_client_id",
+    GMAIL_OAUTH_CLIENT_SECRET="test_gmail_client_secret",
+    OUTLOOK_OAUTH_CLIENT_ID="test_outlook_client_id",
+    OUTLOOK_OAUTH_CLIENT_SECRET="test_outlook_client_secret",
+)
 class TestMailOAuth(
    TestCase,
 ):
@@ -31,12 +37,6 @@ class TestMailOAuth(
        self.user.save()
        self.client.force_login(self.user)
        self.mail_account_handler = MailAccountHandler()
-        # Mock settings
-        settings.OAUTH_CALLBACK_BASE_URL = "http://localhost:8000"
-        settings.GMAIL_OAUTH_CLIENT_ID = "test_gmail_client_id"
-        settings.GMAIL_OAUTH_CLIENT_SECRET = "test_gmail_client_secret"
-        settings.OUTLOOK_OAUTH_CLIENT_ID = "test_outlook_client_id"
-        settings.OUTLOOK_OAUTH_CLIENT_SECRET = "test_outlook_client_secret"
        super().setUp()

    def test_generate_paths(self) -> None:
--- a/src/paperless_remote/init.py
+++ b/src/paperless_remote/init.py
@@ -1,4 +0,0 @@
-# this is here so that django finds the checks.
-from paperless_remote.checks import check_remote_parser_configured
-
-__all__ = ["check_remote_parser_configured"]
--- a/src/paperless_remote/apps.py
+++ b/src/paperless_remote/apps.py
@@ -1,14 +0,0 @@
-from django.apps import AppConfig
-
-from paperless_remote.signals import remote_consumer_declaration
-
-
-class PaperlessRemoteParserConfig(AppConfig):
-    name = "paperless_remote"
-
-    def ready(self) -> None:
-        from documents.signals import document_consumer_declaration
-
-        document_consumer_declaration.connect(remote_consumer_declaration)
-
-        AppConfig.ready(self)
--- a/src/paperless_remote/checks.py
+++ b/src/paperless_remote/checks.py
@@ -1,17 +0,0 @@
-from django.conf import settings
-from django.core.checks import Error
-from django.core.checks import register
-
-
-@register()
-def check_remote_parser_configured(app_configs, **kwargs):
-    if settings.REMOTE_OCR_ENGINE == "azureai" and not (
-        settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
-    ):
-        return [
-            Error(
-                "Azure AI remote parser requires endpoint and API key to be configured.",
-            ),
-        ]
-
-    return []
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -1,118 +0,0 @@
-from pathlib import Path
-
-from django.conf import settings
-
-from paperless_tesseract.parsers import RasterisedDocumentParser
-
-
-class RemoteEngineConfig:
-    def __init__(
-        self,
-        engine: str,
-        api_key: str | None = None,
-        endpoint: str | None = None,
-    ):
-        self.engine = engine
-        self.api_key = api_key
-        self.endpoint = endpoint
-
-    def engine_is_valid(self):
-        valid = self.engine in ["azureai"] and self.api_key is not None
-        if self.engine == "azureai":
-            valid = valid and self.endpoint is not None
-        return valid
-
-
-class RemoteDocumentParser(RasterisedDocumentParser):
-    """
-    This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
-    as this is the only service that provides a remote OCR API with text-embedded PDF output.
-    """
-
-    logging_name = "paperless.parsing.remote"
-
-    def get_settings(self) -> RemoteEngineConfig:
-        """
-        Returns the configuration for the remote OCR engine, loaded from Django settings.
-        """
-        return RemoteEngineConfig(
-            engine=settings.REMOTE_OCR_ENGINE,
-            api_key=settings.REMOTE_OCR_API_KEY,
-            endpoint=settings.REMOTE_OCR_ENDPOINT,
-        )
-
-    def supported_mime_types(self):
-        if self.settings.engine_is_valid():
-            return {
-                "application/pdf": ".pdf",
-                "image/png": ".png",
-                "image/jpeg": ".jpg",
-                "image/tiff": ".tiff",
-                "image/bmp": ".bmp",
-                "image/gif": ".gif",
-                "image/webp": ".webp",
-            }
-        else:
-            return {}
-
-    def azure_ai_vision_parse(
-        self,
-        file: Path,
-    ) -> str | None:
-        """
-        Uses Azure AI Vision to parse the document and return the text content.
-        It requests a searchable PDF output with embedded text.
-        The PDF is saved to the archive_path attribute.
-        Returns the text content extracted from the document.
-        If the parsing fails, it returns None.
-        """
-        from azure.ai.documentintelligence import DocumentIntelligenceClient
-        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
-        from azure.ai.documentintelligence.models import AnalyzeOutputOption
-        from azure.ai.documentintelligence.models import DocumentContentFormat
-        from azure.core.credentials import AzureKeyCredential
-
-        client = DocumentIntelligenceClient(
-            endpoint=self.settings.endpoint,
-            credential=AzureKeyCredential(self.settings.api_key),
-        )
-
-        try:
-            with file.open("rb") as f:
-                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
-                poller = client.begin_analyze_document(
-                    model_id="prebuilt-read",
-                    body=analyze_request,
-                    output_content_format=DocumentContentFormat.TEXT,
-                    output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
-                    content_type="application/json",
-                )
-
-            poller.wait()
-            result_id = poller.details["operation_id"]
-            result = poller.result()
-
-            # Download the PDF with embedded text
-            self.archive_path = self.tempdir / "archive.pdf"
-            with self.archive_path.open("wb") as f:
-                for chunk in client.get_analyze_result_pdf(
-                    model_id="prebuilt-read",
-                    result_id=result_id,
-                ):
-                    f.write(chunk)
-            return result.content
-        except Exception as e:
-            self.log.error(f"Azure AI Vision parsing failed: {e}")
-        finally:
-            client.close()
-
-        return None
-
-    def parse(self, document_path: Path, mime_type, file_name=None):
-        if not self.settings.engine_is_valid():
-            self.log.warning(
-                "No valid remote parser engine is configured, content will be empty.",
-            )
-            self.text = ""
-        elif self.settings.engine == "azureai":
-            self.text = self.azure_ai_vision_parse(document_path)
--- a/src/paperless_remote/signals.py
+++ b/src/paperless_remote/signals.py
@@ -1,18 +0,0 @@
-def get_parser(*args, **kwargs):
-    from paperless_remote.parsers import RemoteDocumentParser
-
-    return RemoteDocumentParser(*args, **kwargs)
-
-
-def get_supported_mime_types():
-    from paperless_remote.parsers import RemoteDocumentParser
-
-    return RemoteDocumentParser(None).supported_mime_types()
-
-
-def remote_consumer_declaration(sender, **kwargs):
-    return {
-        "parser": get_parser,
-        "weight": 5,
-        "mime_types": get_supported_mime_types(),
-    }
--- a/src/paperless_remote/tests/init.py
+++ b/src/paperless_remote/tests/init.py
--- a/src/paperless_remote/tests/test_checks.py
+++ b/src/paperless_remote/tests/test_checks.py
@@ -1,24 +0,0 @@
-from unittest import TestCase
-
-from django.test import override_settings
-
-from paperless_remote import check_remote_parser_configured
-
-
-class TestChecks(TestCase):
-    @override_settings(REMOTE_OCR_ENGINE=None)
-    def test_no_engine(self) -> None:
-        msgs = check_remote_parser_configured(None)
-        self.assertEqual(len(msgs), 0)
-
-    @override_settings(REMOTE_OCR_ENGINE="azureai")
-    @override_settings(REMOTE_OCR_API_KEY="somekey")
-    @override_settings(REMOTE_OCR_ENDPOINT=None)
-    def test_azure_no_endpoint(self) -> None:
-        msgs = check_remote_parser_configured(None)
-        self.assertEqual(len(msgs), 1)
-        self.assertTrue(
-            msgs[0].msg.startswith(
-                "Azure AI remote parser requires endpoint and API key to be configured.",
-            ),
-        )
--- a/src/paperless_remote/tests/test_parser.py
+++ b/src/paperless_remote/tests/test_parser.py
@@ -1,131 +0,0 @@
-import uuid
-from pathlib import Path
-from unittest import mock
-
-from django.test import TestCase
-from django.test import override_settings
-
-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import FileSystemAssertsMixin
-from paperless_remote.parsers import RemoteDocumentParser
-from paperless_remote.signals import get_parser
-
-
-class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
-
-    def assertContainsStrings(self, content: str, strings: list[str]) -> None:
-        # Asserts that all strings appear in content, in the given order.
-        indices = []
-        for s in strings:
-            if s in content:
-                indices.append(content.index(s))
-            else:
-                self.fail(f"'{s}' is not in '{content}'")
-        self.assertListEqual(indices, sorted(indices))
-
-    @mock.patch("paperless_tesseract.parsers.run_subprocess")
-    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
-    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
-        # Arrange mock Azure client
-        mock_client = mock.Mock()
-        mock_client_cls.return_value = mock_client
-
-        # Simulate poller result and its `.details`
-        mock_poller = mock.Mock()
-        mock_poller.wait.return_value = None
-        mock_poller.details = {"operation_id": "fake-op-id"}
-        mock_client.begin_analyze_document.return_value = mock_poller
-        mock_poller.result.return_value.content = "This is a test document."
-
-        # Return dummy PDF bytes
-        mock_client.get_analyze_result_pdf.return_value = [
-            b"%PDF-",
-            b"1.7 ",
-            b"FAKEPDF",
-        ]
-
-        # Simulate pdftotext by writing dummy text to sidecar file
-        def fake_run(cmd, *args, **kwargs) -> None:
-            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
-                f.write("This is a test document.")
-
-        mock_subprocess.side_effect = fake_run
-
-        with override_settings(
-            REMOTE_OCR_ENGINE="azureai",
-            REMOTE_OCR_API_KEY="somekey",
-            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
-        ):
-            parser = get_parser(uuid.uuid4())
-            parser.parse(
-                self.SAMPLE_FILES / "simple-digital.pdf",
-                "application/pdf",
-            )
-
-            self.assertContainsStrings(
-                parser.text.strip(),
-                ["This is a test document."],
-            )
-
-    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
-    def test_get_text_with_azure_error_logged_and_returns_none(
-        self,
-        mock_client_cls,
-    ) -> None:
-        mock_client = mock.Mock()
-        mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
-        mock_client_cls.return_value = mock_client
-
-        with override_settings(
-            REMOTE_OCR_ENGINE="azureai",
-            REMOTE_OCR_API_KEY="somekey",
-            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
-        ):
-            parser = get_parser(uuid.uuid4())
-            with mock.patch.object(parser.log, "error") as mock_log_error:
-                parser.parse(
-                    self.SAMPLE_FILES / "simple-digital.pdf",
-                    "application/pdf",
-                )
-
-        self.assertIsNone(parser.text)
-        mock_client.begin_analyze_document.assert_called_once()
-        mock_client.close.assert_called_once()
-        mock_log_error.assert_called_once()
-        self.assertIn(
-            "Azure AI Vision parsing failed",
-            mock_log_error.call_args[0][0],
-        )
-
-    @override_settings(
-        REMOTE_OCR_ENGINE="azureai",
-        REMOTE_OCR_API_KEY="key",
-        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
-    )
-    def test_supported_mime_types_valid_config(self) -> None:
-        parser = RemoteDocumentParser(uuid.uuid4())
-        expected_types = {
-            "application/pdf": ".pdf",
-            "image/png": ".png",
-            "image/jpeg": ".jpg",
-            "image/tiff": ".tiff",
-            "image/bmp": ".bmp",
-            "image/gif": ".gif",
-            "image/webp": ".webp",
-        }
-        self.assertEqual(parser.supported_mime_types(), expected_types)
-
-    def test_supported_mime_types_invalid_config(self) -> None:
-        parser = get_parser(uuid.uuid4())
-        self.assertEqual(parser.supported_mime_types(), {})
-
-    @override_settings(
-        REMOTE_OCR_ENGINE=None,
-        REMOTE_OCR_API_KEY=None,
-        REMOTE_OCR_ENDPOINT=None,
-    )
-    def test_parse_with_invalid_config(self) -> None:
-        parser = get_parser(uuid.uuid4())
-        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
-        self.assertEqual(parser.text, "")
--- a/src/paperless_tesseract/init.py
+++ b/src/paperless_tesseract/init.py
@@ -1,5 +0,0 @@
-# this is here so that django finds the checks.
-from paperless_tesseract.checks import check_default_language_available
-from paperless_tesseract.checks import get_tesseract_langs
-
-__all__ = ["check_default_language_available", "get_tesseract_langs"]
--- a/src/paperless_tesseract/apps.py
+++ b/src/paperless_tesseract/apps.py
@@ -1,14 +0,0 @@
-from django.apps import AppConfig
-
-from paperless_tesseract.signals import tesseract_consumer_declaration
-
-
-class PaperlessTesseractConfig(AppConfig):
-    name = "paperless_tesseract"
-
-    def ready(self) -> None:
-        from documents.signals import document_consumer_declaration
-
-        document_consumer_declaration.connect(tesseract_consumer_declaration)
-
-        AppConfig.ready(self)
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@@ -1,52 +0,0 @@
-import shutil
-import subprocess
-
-from django.conf import settings
-from django.core.checks import Error
-from django.core.checks import Warning
-from django.core.checks import register
-
-
-def get_tesseract_langs():
-    proc = subprocess.run(
-        [shutil.which("tesseract"), "--list-langs"],
-        capture_output=True,
-    )
-
-    # Decode bytes to string, split on newlines, trim out the header
-    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
-
-    return [x.strip() for x in proc_lines]
-
-
-@register()
-def check_default_language_available(app_configs, **kwargs):
-    errs = []
-
-    if not settings.OCR_LANGUAGE:
-        errs.append(
-            Warning(
-                "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
-                "This means that tesseract will fallback to english.",
-            ),
-        )
-        return errs
-
-    # binaries_check in paperless will check and report if this doesn't exist
-    # So skip trying to do anything here and let that handle missing binaries
-    if shutil.which("tesseract") is not None:
-        installed_langs = get_tesseract_langs()
-
-        specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
-
-        for lang in specified_langs:
-            if lang not in installed_langs:
-                errs.append(
-                    Error(
-                        f"The selected ocr language {lang} is "
-                        f"not installed. Paperless cannot OCR your documents "
-                        f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
-                    ),
-                )
-
-    return errs
--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -1,21 +0,0 @@
-def get_parser(*args, **kwargs):
-    from paperless_tesseract.parsers import RasterisedDocumentParser
-
-    return RasterisedDocumentParser(*args, **kwargs)
-
-
-def tesseract_consumer_declaration(sender, **kwargs):
-    return {
-        "parser": get_parser,
-        "weight": 0,
-        "mime_types": {
-            "application/pdf": ".pdf",
-            "image/jpeg": ".jpg",
-            "image/png": ".png",
-            "image/tiff": ".tif",
-            "image/gif": ".gif",
-            "image/bmp": ".bmp",
-            "image/webp": ".webp",
-            "image/heic": ".heic",
-        },
-    }
--- a/src/paperless_tesseract/tests/init.py
+++ b/src/paperless_tesseract/tests/init.py
--- a/src/paperless_tesseract/tests/test_checks.py
+++ b/src/paperless_tesseract/tests/test_checks.py
@@ -1,67 +0,0 @@
-from unittest import mock
-
-from django.core.checks import ERROR
-from django.test import TestCase
-from django.test import override_settings
-
-from paperless_tesseract import check_default_language_available
-
-
-class TestChecks(TestCase):
-    def test_default_language(self) -> None:
-        check_default_language_available(None)
-
-    @override_settings(OCR_LANGUAGE="")
-    def test_no_language(self) -> None:
-        msgs = check_default_language_available(None)
-        self.assertEqual(len(msgs), 1)
-        self.assertTrue(
-            msgs[0].msg.startswith(
-                "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE",
-            ),
-        )
-
-    @override_settings(OCR_LANGUAGE="ita")
-    @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
-    def test_invalid_language(self, m) -> None:
-        m.return_value = ["deu", "eng"]
-        msgs = check_default_language_available(None)
-        self.assertEqual(len(msgs), 1)
-        self.assertEqual(msgs[0].level, ERROR)
-
-    @override_settings(OCR_LANGUAGE="chi_sim")
-    @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
-    def test_multi_part_language(self, m) -> None:
-        """
-        GIVEN:
-            - An OCR language which is multi part (ie chi-sim)
-            - The language is correctly formatted
-        WHEN:
-            - Installed packages are checked
-        THEN:
-            - No errors are reported
-        """
-        m.return_value = ["chi_sim", "eng"]
-
-        msgs = check_default_language_available(None)
-
-        self.assertEqual(len(msgs), 0)
-
-    @override_settings(OCR_LANGUAGE="chi-sim")
-    @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
-    def test_multi_part_language_bad_format(self, m) -> None:
-        """
-        GIVEN:
-            - An OCR language which is multi part (ie chi-sim)
-            - The language is correctly NOT formatted
-        WHEN:
-            - Installed packages are checked
-        THEN:
-            - No errors are reported
-        """
-        m.return_value = ["chi_sim", "eng"]
-
-        msgs = check_default_language_available(None)
-
-        self.assertEqual(len(msgs), 1)
-        self.assertEqual(msgs[0].level, ERROR)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -1,924 +0,0 @@
-import shutil
-import tempfile
-import unicodedata
-import uuid
-from pathlib import Path
-from unittest import mock
-
-from django.test import TestCase
-from django.test import override_settings
-from ocrmypdf import SubprocessOutputError
-
-from documents.parsers import ParseError
-from documents.parsers import run_convert
-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import FileSystemAssertsMixin
-from paperless_tesseract.parsers import RasterisedDocumentParser
-from paperless_tesseract.parsers import post_process_text
-
-
-class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
-
-    def assertContainsStrings(self, content, strings) -> None:
-        # Asserts that all strings appear in content, in the given order.
-        indices = []
-        for s in strings:
-            if s in content:
-                indices.append(content.index(s))
-            else:
-                self.fail(f"'{s}' is not in '{content}'")
-        self.assertListEqual(indices, sorted(indices))
-
-    def test_post_process_text(self) -> None:
-        text_cases = [
-            ("simple     string", "simple string"),
-            ("simple    newline\n   testing string", "simple newline\ntesting string"),
-            (
-                "utf-8   строка с пробелами в конце  ",
-                "utf-8 строка с пробелами в конце",
-            ),
-        ]
-
-        for source, result in text_cases:
-            actual_result = post_process_text(source)
-            self.assertEqual(
-                result,
-                actual_result,
-                f"strip_exceess_whitespace({source}) != '{result}', but '{actual_result}'",
-            )
-
-    def test_get_text_from_pdf(self) -> None:
-        parser = RasterisedDocumentParser(uuid.uuid4())
-        text = parser.extract_text(
-            None,
-            self.SAMPLE_FILES / "simple-digital.pdf",
-        )
-
-        self.assertContainsStrings(text.strip(), ["This is a test document."])
-
-    def test_get_page_count(self) -> None:
-        """
-        GIVEN:
-            - PDF file with a single page
-            - PDF file with multiple pages
-        WHEN:
-            - The number of pages is requested
-        THEN:
-            - The method returns 1 as the expected number of pages
-            - The method returns the correct number of pages (6)
-        """
-        parser = RasterisedDocumentParser(uuid.uuid4())
-        page_count = parser.get_page_count(
-            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertEqual(page_count, 1)
-
-        page_count = parser.get_page_count(
-            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
-            "application/pdf",
-        )
-        self.assertEqual(page_count, 6)
-
-    def test_get_page_count_password_protected(self) -> None:
-        """
-        GIVEN:
-            - Password protected PDF file
-        WHEN:
-            - The number of pages is requested
-        THEN:
-            - The method returns None
-        """
-        parser = RasterisedDocumentParser(uuid.uuid4())
-        with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
-            page_count = parser.get_page_count(
-                str(self.SAMPLE_FILES / "password-protected.pdf"),
-                "application/pdf",
-            )
-            self.assertEqual(page_count, None)
-            self.assertIn("Unable to determine PDF page count", cm.output[0])
-
-    def test_thumbnail(self) -> None:
-        parser = RasterisedDocumentParser(uuid.uuid4())
-        thumb = parser.get_thumbnail(
-            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(thumb)
-
-    @mock.patch("documents.parsers.run_convert")
-    def test_thumbnail_fallback(self, m) -> None:
-        def call_convert(input_file, output_file, **kwargs) -> None:
-            if ".pdf" in str(input_file):
-                raise ParseError("Does not compute.")
-            else:
-                run_convert(input_file=input_file, output_file=output_file, **kwargs)
-
-        m.side_effect = call_convert
-
-        parser = RasterisedDocumentParser(uuid.uuid4())
-        thumb = parser.get_thumbnail(
-            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(thumb)
-
-    def test_thumbnail_encrypted(self) -> None:
-        parser = RasterisedDocumentParser(uuid.uuid4())
-        thumb = parser.get_thumbnail(
-            str(self.SAMPLE_FILES / "encrypted.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(thumb)
-
-    def test_get_dpi(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
-        self.assertEqual(dpi, None)
-
-        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
-        self.assertEqual(dpi, 72)
-
-    def test_simple_digital(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(
-            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-            "application/pdf",
-        )
-
-        self.assertIsFile(parser.archive_path)
-
-        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
-
-    def test_with_form(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(
-            str(self.SAMPLE_FILES / "with-form.pdf"),
-            "application/pdf",
-        )
-
-        self.assertIsFile(parser.archive_path)
-
-        self.assertContainsStrings(
-            parser.get_text(),
-            ["Please enter your name in here:", "This is a PDF document with a form."],
-        )
-
-    @override_settings(OCR_MODE="redo")
-    def test_with_form_error(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(
-            str(self.SAMPLE_FILES / "with-form.pdf"),
-            "application/pdf",
-        )
-
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text(),
-            ["Please enter your name in here:", "This is a PDF document with a form."],
-        )
-
-    @override_settings(OCR_MODE="skip")
-    def test_signed(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
-
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text(),
-            [
-                "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
-                "automated testing of signed/encrypted PDFs",
-            ],
-        )
-
-    @override_settings(OCR_MODE="skip")
-    def test_encrypted(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(
-            str(self.SAMPLE_FILES / "encrypted.pdf"),
-            "application/pdf",
-        )
-
-        self.assertIsNone(parser.archive_path)
-        self.assertEqual(parser.get_text(), "")
-
-    @override_settings(OCR_MODE="redo")
-    def test_with_form_error_notext(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "with-form.pdf"),
-            "application/pdf",
-        )
-
-        self.assertContainsStrings(
-            parser.get_text(),
-            ["Please enter your name in here:", "This is a PDF document with a form."],
-        )
-
-    @override_settings(OCR_MODE="force")
-    def test_with_form_force(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(
-            str(self.SAMPLE_FILES / "with-form.pdf"),
-            "application/pdf",
-        )
-
-        self.assertContainsStrings(
-            parser.get_text(),
-            ["Please enter your name in here:", "This is a PDF document with a form."],
-        )
-
-    def test_image_simple(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
-
-        self.assertIsFile(parser.archive_path)
-
-        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
-
-    def test_image_simple_alpha(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        with tempfile.TemporaryDirectory() as tempdir:
-            # Copy sample file to temp directory, as the parsing changes the file
-            # and this makes it modified to Git
-            sample_file = self.SAMPLE_FILES / "simple-alpha.png"
-            dest_file = Path(tempdir) / "simple-alpha.png"
-            shutil.copy(sample_file, dest_file)
-
-            parser.parse(str(dest_file), "image/png")
-
-            self.assertIsFile(parser.archive_path)
-
-            self.assertContainsStrings(parser.get_text(), ["This is a test document."])
-
-    def test_image_calc_a4_dpi(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        dpi = parser.calculate_a4_dpi(
-            str(self.SAMPLE_FILES / "simple-no-dpi.png"),
-        )
-
-        self.assertEqual(dpi, 62)
-
-    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
-    def test_image_dpi_fail(self, m) -> None:
-        m.return_value = None
-        parser = RasterisedDocumentParser(None)
-
-        def f() -> None:
-            parser.parse(
-                str(self.SAMPLE_FILES / "simple-no-dpi.png"),
-                "image/png",
-            )
-
-        self.assertRaises(ParseError, f)
-
-    @override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
-    def test_image_no_dpi_default(self) -> None:
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
-
-        self.assertIsFile(parser.archive_path)
-
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["this is a test document."],
-        )
-
-    def test_multi_page(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
-    def test_multi_page_pages_skip(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
-    def test_multi_page_pages_redo(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_PAGES=2, OCR_MODE="force")
-    def test_multi_page_pages_force(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_MODE="skip")
-    def test_multi_page_analog_pages_skip(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
-    def test_multi_page_analog_pages_redo(self) -> None:
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR of only pages 1 and 2 requested
-            - OCR mode set to redo
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text of page 1 and 2 extracted
-            - An archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
-        self.assertNotIn("page 3", parser.get_text().lower())
-
-    @override_settings(OCR_PAGES=1, OCR_MODE="force")
-    def test_multi_page_analog_pages_force(self) -> None:
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR of only page 1 requested
-            - OCR mode set to force
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Only text of page 1 is extracted
-            - An archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
-        self.assertNotIn("page 2", parser.get_text().lower())
-        self.assertNotIn("page 3", parser.get_text().lower())
-
-    @override_settings(OCR_MODE="skip_noarchive")
-    def test_skip_noarchive_withtext(self) -> None:
-        """
-        GIVEN:
-            - File with existing text layer
-            - OCR mode set to skip_noarchive
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - No archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_MODE="skip_noarchive")
-    def test_skip_noarchive_notext(self) -> None:
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR mode set to skip_noarchive
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - An archive file is created with the OCRd text
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-        self.assertIsNotNone(parser.archive_path)
-
-    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
-    def test_skip_archive_never_withtext(self) -> None:
-        """
-        GIVEN:
-            - File with existing text layer
-            - OCR_SKIP_ARCHIVE_FILE set to never
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from text layer is extracted
-            - Archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNotNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
-    def test_skip_archive_never_withimages(self) -> None:
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR_SKIP_ARCHIVE_FILE set to never
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - Archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNotNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
-    def test_skip_archive_withtext_withtext(self) -> None:
-        """
-        GIVEN:
-            - File with existing text layer
-            - OCR_SKIP_ARCHIVE_FILE set to with_text
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from text layer is extracted
-            - No archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
-    def test_skip_archive_withtext_withimages(self) -> None:
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR_SKIP_ARCHIVE_FILE set to with_text
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - Archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNotNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
-    def test_skip_archive_always_withtext(self) -> None:
-        """
-        GIVEN:
-            - File with existing text layer
-            - OCR_SKIP_ARCHIVE_FILE set to always
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from text layer is extracted
-            - No archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
-    def test_skip_archive_always_withimages(self) -> None:
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR_SKIP_ARCHIVE_FILE set to always
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - No archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_MODE="skip")
-    def test_multi_page_mixed(self) -> None:
-        """
-        GIVEN:
-            - File with some text contained in images and some in text layer
-            - OCR mode set to skip
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - An archive file is created with the OCRd text and the original text
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNotNone(parser.archive_path)
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
-        )
-
-        with (parser.tempdir / "sidecar.txt").open() as f:
-            sidecar = f.read()
-
-        self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
-
-    @override_settings(OCR_MODE="redo")
-    def test_single_page_mixed(self) -> None:
-        """
-        GIVEN:
-            - File with some text contained in images and some in text layer
-            - Text and images are mixed on the same page
-            - OCR mode set to redo
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - Full content of the file is parsed (not just the image text)
-            - An archive file is created with the OCRd text and the original text
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNotNone(parser.archive_path)
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            [
-                "this is some normal text, present on page 1 of the document.",
-                "this is some text, but in an image, also on page 1.",
-                "this is further text on page 1.",
-            ],
-        )
-
-        with (parser.tempdir / "sidecar.txt").open() as f:
-            sidecar = f.read().lower()
-
-        self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
-        self.assertNotIn(
-            "this is some normal text, present on page 1 of the document.",
-            sidecar,
-        )
-
-    @override_settings(OCR_MODE="skip_noarchive")
-    def test_multi_page_mixed_no_archive(self) -> None:
-        """
-        GIVEN:
-            - File with some text contained in images and some in text layer
-            - OCR mode set to skip_noarchive
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - No archive file is created as original file contains text
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 4", "page 5", "page 6"],
-        )
-
-    @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
-    def test_rotate(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
-        self.assertContainsStrings(
-            parser.get_text(),
-            [
-                "This is the text that appears on the first page. It’s a lot of text.",
-                "Even if the pages are rotated, OCRmyPDF still gets the job done.",
-                "This is a really weird file with lots of nonsense text.",
-                "If you read this, it’s your own fault. Also check your screen orientation.",
-            ],
-        )
-
-    def test_multi_page_tiff(self) -> None:
-        """
-        GIVEN:
-            - Multi-page TIFF image
-        WHEN:
-            - Image is parsed
-        THEN:
-            - Text from all pages extracted
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "multi-page-images.tiff"),
-            "image/tiff",
-        )
-        self.assertIsFile(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    def test_multi_page_tiff_alpha(self) -> None:
-        """
-        GIVEN:
-            - Multi-page TIFF image
-            - Image include an alpha channel
-        WHEN:
-            - Image is parsed
-        THEN:
-            - Text from all pages extracted
-        """
-        parser = RasterisedDocumentParser(None)
-        sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
-        with tempfile.NamedTemporaryFile() as tmp_file:
-            shutil.copy(sample_file, tmp_file.name)
-            parser.parse(
-                tmp_file.name,
-                "image/tiff",
-            )
-            self.assertIsFile(parser.archive_path)
-            self.assertContainsStrings(
-                parser.get_text().lower(),
-                ["page 1", "page 2", "page 3"],
-            )
-
-    def test_multi_page_tiff_alpha_srgb(self) -> None:
-        """
-        GIVEN:
-            - Multi-page TIFF image
-            - Image include an alpha channel
-            - Image is srgb colorspace
-        WHEN:
-            - Image is parsed
-        THEN:
-            - Text from all pages extracted
-        """
-        parser = RasterisedDocumentParser(None)
-        sample_file = str(
-            self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
-        )
-        with tempfile.NamedTemporaryFile() as tmp_file:
-            shutil.copy(sample_file, tmp_file.name)
-            parser.parse(
-                tmp_file.name,
-                "image/tiff",
-            )
-            self.assertIsFile(parser.archive_path)
-            self.assertContainsStrings(
-                parser.get_text().lower(),
-                ["page 1", "page 2", "page 3"],
-            )
-
-    def test_ocrmypdf_parameters(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        params = parser.construct_ocrmypdf_parameters(
-            input_file="input.pdf",
-            output_file="output.pdf",
-            sidecar_file="sidecar.txt",
-            mime_type="application/pdf",
-            safe_fallback=False,
-        )
-
-        self.assertEqual(params["input_file_or_options"], "input.pdf")
-        self.assertEqual(params["output_file"], "output.pdf")
-        self.assertEqual(params["sidecar"], "sidecar.txt")
-
-        with override_settings(OCR_CLEAN="none"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertNotIn("clean", params)
-            self.assertNotIn("clean_final", params)
-
-        with override_settings(OCR_CLEAN="clean"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertTrue(params["clean"])
-            self.assertNotIn("clean_final", params)
-
-        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertTrue(params["clean_final"])
-            self.assertNotIn("clean", params)
-
-        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertTrue(params["clean"])
-            self.assertNotIn("clean_final", params)
-
-        with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertTrue(params["deskew"])
-
-        with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertNotIn("deskew", params)
-
-        with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertNotIn("deskew", params)
-
-        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertIn("max_image_mpixels", params)
-            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
-
-        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
-            parser = RasterisedDocumentParser(None)
-            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-            self.assertNotIn("max_image_mpixels", params)
-
-    def test_rtl_language_detection(self) -> None:
-        """
-        GIVEN:
-            - File with text in an RTL language
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from the document is extracted
-        """
-        parser = RasterisedDocumentParser(None)
-
-        parser.parse(
-            str(self.SAMPLE_FILES / "rtl-test.pdf"),
-            "application/pdf",
-        )
-
-        # OCR output for RTL text varies across platforms/versions due to
-        # bidi controls and presentation forms; normalize before assertion.
-        normalized_text = "".join(
-            char
-            for char in unicodedata.normalize("NFKC", parser.get_text())
-            if unicodedata.category(char) != "Cf" and not char.isspace()
-        )
-
-        self.assertIn("ةرازو", normalized_text)
-        self.assertTrue(
-            any(token in normalized_text for token in ("ةیلخادلا", "الاخليد")),
-        )
-
-    @mock.patch("ocrmypdf.ocr")
-    def test_gs_rendering_error(self, m) -> None:
-        m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
-        parser = RasterisedDocumentParser(None)
-
-        self.assertRaises(
-            ParseError,
-            parser.parse,
-            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-            "application/pdf",
-        )
-
-
-class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_FILES = Path(__file__).parent / "samples"
-
-    def test_bmp(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
-        self.assertIsFile(parser.archive_path)
-        self.assertIn("this is a test document", parser.get_text().lower())
-
-    def test_jpg(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
-        self.assertIsFile(parser.archive_path)
-        self.assertIn("this is a test document", parser.get_text().lower())
-
-    def test_heic(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
-        self.assertIsFile(parser.archive_path)
-        self.assertIn("pizza", parser.get_text().lower())
-
-    @override_settings(OCR_IMAGE_DPI=200)
-    def test_gif(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
-        self.assertIsFile(parser.archive_path)
-        self.assertIn("this is a test document", parser.get_text().lower())
-
-    def test_tiff(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
-        self.assertIsFile(parser.archive_path)
-        self.assertIn("this is a test document", parser.get_text().lower())
-
-    @override_settings(OCR_IMAGE_DPI=72)
-    def test_webp(self) -> None:
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            str(self.SAMPLE_FILES / "document.webp"),
-            "image/webp",
-        )
-        self.assertIsFile(parser.archive_path)
-        # Older tesseracts consistently mangle the space between "a webp",
-        # tesseract 5.3.0 seems to do a better job, so we're accepting both
-        self.assertRegex(
-            parser.get_text().lower(),
-            r"this is a ?webp document, created 11/14/2022.",
-        )
--- a/src/paperless_text/__init__.py
+++ b/src/paperless_text/__init__.py
--- a/src/paperless_text/apps.py
+++ b/src/paperless_text/apps.py
@@ -1,14 +0,0 @@
-from django.apps import AppConfig
-
-from paperless_text.signals import text_consumer_declaration
-
-
-class PaperlessTextConfig(AppConfig):
-    name = "paperless_text"
-
-    def ready(self) -> None:
-        from documents.signals import document_consumer_declaration
-
-        document_consumer_declaration.connect(text_consumer_declaration)
-
-        AppConfig.ready(self)
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,24 +0,0 @@
-def get_parser(*args, **kwargs):
-    from paperless.parsers.text import TextDocumentParser
-
-    # TextDocumentParser accepts logging_group for constructor compatibility but
-    # does not store or use it (no legacy DocumentParser base class).
-    # progress_callback is also not used.  Both may arrive as a positional arg
-    # (consumer) or a keyword arg (views); *args absorbs the positional form,
-    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
-    # path with the new ParserRegistry so the shim can be removed at that point.
-    kwargs.pop("logging_group", None)
-    kwargs.pop("progress_callback", None)
-    return TextDocumentParser()
-
-
-def text_consumer_declaration(sender, **kwargs):
-    return {
-        "parser": get_parser,
-        "weight": 10,
-        "mime_types": {
-            "text/plain": ".txt",
-            "text/csv": ".csv",
-            "application/csv": ".csv",
-        },
-    }
--- a/src/paperless_text/tests/__init__.py
+++ b/src/paperless_text/tests/__init__.py
--- a/src/paperless_tika/__init__.py
+++ b/src/paperless_tika/__init__.py
--- a/src/paperless_tika/apps.py
+++ b/src/paperless_tika/apps.py
@@ -1,15 +0,0 @@
-from django.apps import AppConfig
-from django.conf import settings
-
-from paperless_tika.signals import tika_consumer_declaration
-
-
-class PaperlessTikaConfig(AppConfig):
-    name = "paperless_tika"
-
-    def ready(self) -> None:
-        from documents.signals import document_consumer_declaration
-
-        if settings.TIKA_ENABLED:
-            document_consumer_declaration.connect(tika_consumer_declaration)
-        AppConfig.ready(self)
--- a/Show More
+++ b/Show More