Feature: migrate RemoteDocumentParser to ParserProtocol interface

Rewrites the remote OCR parser to the new plugin system contract:

- `supported_mime_types()` is now a classmethod that always returns the full set of 7 MIME types; the old instance-method hack (returning {} when unconfigured) is removed
- `score()` classmethod returns None when no remote engine is configured (making the parser invisible to the registry), and 20 when active — higher than the tesseract default of 10 so the remote engine takes priority when both are available
- No longer inherits from RasterisedDocumentParser; inherits no parser class at all — just implements the protocol directly
- `can_produce_archive = True`; `requires_pdf_rendition = False`
- `_azure_ai_vision_parse()` takes explicit config arg; API client created and closed within the method
- `get_page_count()` returns the PDF page count for application/pdf, delegating to the new `get_page_count_for_pdf()` utility
- `extract_metadata()` delegates to `extract_pdf_metadata()` for PDFs; returns [] for all other MIME types

New files:

- `src/paperless/parsers/utils.py` — shared `extract_pdf_metadata()` and `get_page_count_for_pdf()` utilities (pikepdf-based); both the remote and tesseract parsers will use these going forward
- `src/paperless/tests/parsers/test_remote_parser.py` — 42 pytest-style tests using pytest-django `settings` and pytest-mock `mocker` fixtures
- `src/paperless/tests/parsers/conftest.py` — remote parser instance, sample-file, and settings-helper fixtures

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 09:55:29 +00:00 · 2026-03-13 11:52:11 -07:00
parent 75dce7f19f
commit 5d4d87764c
5 changed files with 1058 additions and 164 deletions
@@ -193,9 +193,11 @@ class ParserRegistry:
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
+        from paperless.parsers.remote import RemoteDocumentParser
        from paperless.parsers.text import TextDocumentParser

        self.register_builtin(TextDocumentParser)
+        self.register_builtin(RemoteDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
@@ -1,70 +1,381 @@
+"""
+Built-in remote-OCR document parser.
+
+Handles documents by sending them to a configured remote OCR engine
+(currently Azure AI Vision / Document Intelligence) and retrieving both
+the extracted text and a searchable PDF with an embedded text layer.
+
+When no engine is configured, ``score()`` returns ``None`` so the parser
+is effectively invisible to the registry — the tesseract parser handles
+these MIME types instead.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
 from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self

 from django.conf import settings

-from paperless_tesseract.parsers import RasterisedDocumentParser
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsing.remote")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "application/pdf": ".pdf",
+    "image/png": ".png",
+    "image/jpeg": ".jpg",
+    "image/tiff": ".tiff",
+    "image/bmp": ".bmp",
+    "image/gif": ".gif",
+    "image/webp": ".webp",
+}


 class RemoteEngineConfig:
+    """Holds and validates the remote OCR engine configuration."""
+
    def __init__(
        self,
-        engine: str,
+        engine: str | None,
        api_key: str | None = None,
        endpoint: str | None = None,
-    ):
+    ) -> None:
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

-    def engine_is_valid(self):
-        valid = self.engine in ["azureai"] and self.api_key is not None
-        if self.engine == "azureai":
-            valid = valid and self.endpoint is not None
-        return valid
+    def engine_is_valid(self) -> bool:
+        """Return True when the engine is known and fully configured."""
+        return (
+            self.engine in ("azureai",)
+            and self.api_key is not None
+            and not (self.engine == "azureai" and self.endpoint is None)
+        )


-class RemoteDocumentParser(RasterisedDocumentParser):
-    """
-    This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
-    as this is the only service that provides a remote OCR API with text-embedded PDF output.
+class RemoteDocumentParser:
+    """Parse documents via a remote OCR API (currently Azure AI Vision).
+
+    This parser sends documents to a remote engine that returns both
+    extracted text and a searchable PDF with an embedded text layer.
+    It does not depend on Tesseract or ocrmypdf.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
    """

-    logging_name = "paperless.parsing.remote"
+    name: str = "Paperless-ngx Remote OCR Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"

-    def get_settings(self) -> RemoteEngineConfig:
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser can handle.
+
+        The full set is always returned regardless of whether a remote
+        engine is configured.  The ``score()`` method handles the
+        "am I active?" logic by returning ``None`` when not configured.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
        """
-        Returns the configuration for the remote OCR engine, loaded from Django settings.
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file, or None.
+
+        Returns ``None`` when no valid remote engine is configured,
+        making the parser invisible to the registry for this file.
+        When configured, returns 20 — higher than the Tesseract parser's
+        default of 10 — so the remote engine takes priority.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            20 when the remote engine is configured and the MIME type is
+            supported, otherwise None.
        """
-        return RemoteEngineConfig(
+        config = RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+        if not config.engine_is_valid():
+            return None
+        if mime_type not in _SUPPORTED_MIME_TYPES:
+            return None
+        return 20
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Report that this parser supplies an archive (searchable PDF) copy.
+
+        Returns
+        -------
+        bool
+            ``True`` unconditionally.  The PDF downloaded from the remote
+            engine is kept as the archive copy.
+        """
+        return True
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Report that no separate PDF rendition is needed for display.
+
+        Returns
+        -------
+        bool
+            ``False`` unconditionally.  PDF originals are shown as-is and
+            image originals are shown through their archive copy.
+        """
+        return False
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        """Prepare a private scratch directory and empty result slots.
+
+        Parameters
+        ----------
+        logging_group:
+            Opaque value stored on the instance and later handed to the
+            thumbnail helper.
+        """
+        scratch_root = settings.SCRATCH_DIR
+        scratch_root.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=scratch_root),
+        )
+        self._logging_group = logging_group
+        self._text: str | None = None
+        self._archive_path: Path | None = None
+
+    def __enter__(self) -> Self:
+        """Enter the context manager; the parser itself is the managed object."""
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Delete the scratch directory; exceptions are never suppressed."""
+        scratch = self._tempdir
+        logger.debug("Cleaning up temporary directory %s", scratch)
+        # Best-effort removal: a failure to delete must not mask the
+        # exception (if any) that is leaving the ``with`` block.
+        shutil.rmtree(scratch, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Send the document to the remote engine and store results.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the document file to parse.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            Ignored — the remote engine always returns a searchable PDF,
+            which is stored as the archive copy regardless of this flag.
+        """
+        config = RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

-    def supported_mime_types(self):
-        if self.settings.engine_is_valid():
-            return {
-                "application/pdf": ".pdf",
-                "image/png": ".png",
-                "image/jpeg": ".jpg",
-                "image/tiff": ".tiff",
-                "image/bmp": ".bmp",
-                "image/gif": ".gif",
-                "image/webp": ".webp",
-            }
-        else:
-            return {}
+        if not config.engine_is_valid():
+            logger.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self._text = ""
+            return

-    def azure_ai_vision_parse(
+        if config.engine == "azureai":
+            self._text = self._azure_ai_vision_parse(document_path, config)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the text stored on the instance by ``parse()``.
+
+        Returns
+        -------
+        str | None
+            Whatever ``parse()`` last stored, or ``None`` if nothing has
+            been stored yet.
+        """
+        extracted = self._text
+        return extracted
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date, which this parser never determines.
+
+        Returns
+        -------
+        datetime.datetime | None
+            ``None`` in every case; no date detection is performed here.
+        """
+        return None
+
+    def get_archive_path(self) -> Path | None:
+        """Return where the archive PDF was written, if one was produced.
+
+        Returns
+        -------
+        Path | None
+            The stored archive path, or ``None`` when none has been set.
+        """
+        archive = self._archive_path
+        return archive
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Render a thumbnail for the document.
+
+        The archive PDF recorded on the instance is preferred; when no
+        archive path is set, ``document_path`` is rendered instead.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.  Not consulted here.
+
+        Returns
+        -------
+        Path
+            The path returned by ``make_thumbnail_from_pdf``, which is
+            given this parser's temp directory to work in.
+        """
+        # Imported lazily: the helper still lives in documents.parsers and
+        # is expected to move to paperless.parsers.utils once the
+        # tesseract parser is migrated in a later phase.
+        from documents.parsers import make_thumbnail_from_pdf
+
+        thumbnail_source = self._archive_path or document_path
+        return make_thumbnail_from_pdf(
+            thumbnail_source,
+            self._tempdir,
+            self._logging_group,
+        )
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Count the pages of a PDF document.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        int | None
+            The value reported by ``get_page_count_for_pdf`` for PDF
+            inputs; ``None`` for every other MIME type.
+        """
+        if mime_type == "application/pdf":
+            # Imported only on the PDF path, where it is actually needed.
+            from paperless.parsers.utils import get_page_count_for_pdf
+
+            return get_page_count_for_pdf(document_path, log=logger)
+
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Collect format-specific metadata for the document.
+
+        PDF files are handed to the shared ``extract_pdf_metadata`` helper;
+        every other MIME type yields an empty list.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the file to extract metadata from.
+        mime_type:
+            MIME type of the file.  May be ``"application/pdf"`` when
+            called for the archive version of an image original.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Zero or more metadata entries.
+        """
+        if mime_type == "application/pdf":
+            # Imported only on the PDF path, where it is actually needed.
+            from paperless.parsers.utils import extract_pdf_metadata
+
+            return extract_pdf_metadata(document_path, log=logger)
+
+        return []
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _azure_ai_vision_parse(
        self,
        file: Path,
+        config: RemoteEngineConfig,
    ) -> str | None:
-        """
-        Uses Azure AI Vision to parse the document and return the text content.
-        It requests a searchable PDF output with embedded text.
-        The PDF is saved to the archive_path attribute.
-        Returns the text content extracted from the document.
-        If the parsing fails, it returns None.
+        """Send ``file`` to Azure AI Document Intelligence and return text.
+
+        Downloads the searchable PDF output from Azure and stores it at
+        ``self._archive_path``.  Returns the extracted text content, or
+        ``None`` on failure (the error is logged).
+
+        Parameters
+        ----------
+        file:
+            Absolute path to the document to analyse.
+        config:
+            Validated remote engine configuration.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if the Azure call failed.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
@@ -73,8 +384,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
-            endpoint=self.settings.endpoint,
-            credential=AzureKeyCredential(self.settings.api_key),
+            endpoint=config.endpoint,
+            credential=AzureKeyCredential(config.api_key),
        )

        try:
@@ -84,7 +395,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
                    model_id="prebuilt-read",
                    body=analyze_request,
                    output_content_format=DocumentContentFormat.TEXT,
-                    output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
+                    output=[AnalyzeOutputOption.PDF],
                    content_type="application/json",
                )

@@ -92,27 +403,20 @@ class RemoteDocumentParser(RasterisedDocumentParser):
            result_id = poller.details["operation_id"]
            result = poller.result()

-            # Download the PDF with embedded text
-            self.archive_path = self.tempdir / "archive.pdf"
-            with self.archive_path.open("wb") as f:
+            self._archive_path = self._tempdir / "archive.pdf"
+            with self._archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)
+
            return result.content
+
        except Exception as e:
-            self.log.error(f"Azure AI Vision parsing failed: {e}")
+            logger.error("Azure AI Vision parsing failed: %s", e)
+
        finally:
            client.close()

        return None
-
-    def parse(self, document_path: Path, mime_type, file_name=None):
-        if not self.settings.engine_is_valid():
-            self.log.warning(
-                "No valid remote parser engine is configured, content will be empty.",
-            )
-            self.text = ""
-        elif self.settings.engine == "azureai":
-            self.text = self.azure_ai_vision_parse(document_path)
@@ -0,0 +1,130 @@
+"""
+Shared utilities for Paperless-ngx document parsers.
+
+Functions here are format-neutral helpers that multiple parsers need.
+Keeping them here avoids parsers inheriting from each other just to
+share implementation.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsers.utils")
+
+
+def get_page_count_for_pdf(
+    document_path: Path,
+    log: logging.Logger | None = None,
+) -> int | None:
+    """Count the pages of a PDF file with pikepdf.
+
+    Parameters
+    ----------
+    document_path:
+        Absolute path to the PDF file.
+    log:
+        Logger that receives the warning on failure.  The module-level
+        logger is used when this is omitted.
+
+    Returns
+    -------
+    int | None
+        Number of pages, or ``None`` when opening or reading the file
+        raised an exception (a warning is logged in that case).
+    """
+    import pikepdf
+
+    active_log = log or logger
+
+    try:
+        with pikepdf.Pdf.open(document_path) as pdf:
+            page_total = len(pdf.pages)
+    except Exception as exc:
+        active_log.warning(
+            "Unable to determine PDF page count for %s: %s",
+            document_path,
+            exc,
+        )
+        return None
+
+    return page_total
+
+
+def extract_pdf_metadata(
+    document_path: Path,
+    log: logging.Logger | None = None,
+) -> list[MetadataEntry]:
+    """Extract XMP/PDF metadata from a PDF file using pikepdf.
+
+    Reads the XMP metadata entries from the document and returns them as a
+    list of ``MetadataEntry`` items.  Failures to open the file, read its
+    metadata, or convert an individual key are logged and skipped rather
+    than raised.  The PDF is opened in a ``with`` block so it is closed
+    again before the entries are converted.
+
+    Parameters
+    ----------
+    document_path:
+        Absolute path to the PDF file.
+    log:
+        Logger to use for warnings and debug messages.  Falls back to the
+        module-level logger when omitted.
+
+    Returns
+    -------
+    list[MetadataEntry]
+        Zero or more metadata entries.  Returns ``[]`` if the file cannot
+        be opened or its metadata cannot be read.
+    """
+    import pikepdf
+
+    from paperless.parsers import MetadataEntry
+
+    _log = log or logger
+    result: list[MetadataEntry] = []
+    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+    try:
+        # Snapshot the entries while the file is open; the context manager
+        # closes the PDF even when reading the metadata fails.
+        with pikepdf.open(document_path) as pdf:
+            meta = pdf.open_metadata()
+            entries = list(meta.items())
+    except Exception as e:
+        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
+        return []
+
+    for key, raw_value in entries:
+        # Kept as the raw object until converted so the warning below can
+        # still report something if the conversion itself fails.
+        value = raw_value
+
+        try:
+            m = namespace_pattern.match(key)
+            if m is None:
+                # Keys that are not of the form ``{namespace}name`` are ignored.
+                continue
+
+            namespace = m.group(1)
+            key_value = m.group(2)
+
+            try:
+                namespace.encode("utf-8")
+                key_value.encode("utf-8")
+            except UnicodeEncodeError as enc_err:
+                _log.debug("Skipping metadata key %s: %s", key, enc_err)
+                continue
+
+            # List values are flattened into one space-separated string.
+            if isinstance(raw_value, list):
+                value = " ".join(str(item) for item in raw_value)
+            else:
+                value = str(raw_value)
+
+            result.append(
+                MetadataEntry(
+                    namespace=namespace,
+                    prefix=meta.REVERSE_NS[namespace],
+                    key=key_value,
+                    value=value,
+                ),
+            )
+        except Exception as e:
+            _log.warning(
+                "Error reading metadata key %s value %s: %s",
+                key,
+                value,
+                e,
+            )
+
+    return result
@@ -10,12 +10,15 @@ from typing import TYPE_CHECKING

 import pytest

+from paperless.parsers.remote import RemoteDocumentParser
 from paperless.parsers.text import TextDocumentParser

 if TYPE_CHECKING:
    from collections.abc import Generator
    from pathlib import Path

+    from pytest_django.fixtures import SettingsWrapper
+

 # ------------------------------------------------------------------
 # Text parser sample files
@@ -74,3 +77,89 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
    """
    with TextDocumentParser() as parser:
        yield parser
+
+
+# ------------------------------------------------------------------
+# Remote parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def remote_samples_dir(samples_dir: Path) -> Path:
+    """Locate the directory holding the remote parser's sample files.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/remote/``
+    """
+    return samples_dir.joinpath("remote")
+
+
+@pytest.fixture(scope="session")
+def sample_pdf_file(remote_samples_dir: Path) -> Path:
+    """Locate the simple digital PDF sample used by the remote parser tests.
+
+    Returns
+    -------
+    Path
+        ``simple-digital.pdf`` inside ``remote_samples_dir``.
+    """
+    return remote_samples_dir.joinpath("simple-digital.pdf")
+
+
+# ------------------------------------------------------------------
+# Remote parser instance
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
+    """Provide a RemoteDocumentParser for the duration of one test.
+
+    The parser is used as a context manager, so its exit handler runs
+    once the test has finished with it.
+
+    Yields
+    ------
+    RemoteDocumentParser
+        A freshly constructed parser instance.
+    """
+    with RemoteDocumentParser() as active_parser:
+        yield active_parser
+
+
+# ------------------------------------------------------------------
+# Remote parser settings helpers
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
+    """Point the Django settings at a fully configured Azure AI OCR engine.
+
+    Assigns test values to ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``
+    and ``REMOTE_OCR_ENDPOINT`` on the pytest-django ``settings`` fixture,
+    which restores the previous values after the test.
+
+    Returns
+    -------
+    SettingsWrapper
+        The same settings object, so callers can apply further overrides.
+    """
+    overrides = {
+        "REMOTE_OCR_ENGINE": "azureai",
+        "REMOTE_OCR_API_KEY": "test-api-key",
+        "REMOTE_OCR_ENDPOINT": "https://test.cognitiveservices.azure.com",
+    }
+    for setting_name, setting_value in overrides.items():
+        setattr(settings, setting_name, setting_value)
+    return settings
+
+
+@pytest.fixture()
+def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
+    """Clear every remote OCR setting so that no engine is configured.
+
+    Returns
+    -------
+    SettingsWrapper
+        The same settings object, with the three remote OCR values set
+        to ``None``.
+    """
+    for setting_name in (
+        "REMOTE_OCR_ENGINE",
+        "REMOTE_OCR_API_KEY",
+        "REMOTE_OCR_ENDPOINT",
+    ):
+        setattr(settings, setting_name, None)
+    return settings
@@ -1,131 +1,500 @@
-import uuid
-from pathlib import Path
-from unittest import mock
+"""
+Tests for paperless.parsers.remote.RemoteDocumentParser.

-from django.test import TestCase
-from django.test import override_settings
-from paperless_remote.parsers import RemoteDocumentParser
+All tests use the context-manager protocol for parser lifecycle.  The Azure
+AI client is always mocked via the ``mocker`` fixture so no real network
+calls are made.  Django settings are overridden via the pytest-django
+``settings`` fixture (or the ``azure_settings`` / ``no_engine_settings``
+helpers defined in conftest.py).
+"""

-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import FileSystemAssertsMixin
-from paperless_remote.signals import get_parser
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest.mock import Mock
+
+import pytest
+
+from paperless.parsers import ParserProtocol
+from paperless.parsers.remote import RemoteDocumentParser
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from pytest_django.fixtures import SettingsWrapper
+    from pytest_mock import MockerFixture


-class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------

-    def assertContainsStrings(self, content: str, strings: list[str]) -> None:
-        # Asserts that all strings appear in content, in the given order.
-        indices = []
-        for s in strings:
-            if s in content:
-                indices.append(content.index(s))
-            else:
-                self.fail(f"'{s}' is not in '{content}'")
-        self.assertListEqual(indices, sorted(indices))

-    @mock.patch("paperless_tesseract.parsers.run_subprocess")
-    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
-    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
-        # Arrange mock Azure client
-        mock_client = mock.Mock()
-        mock_client_cls.return_value = mock_client
+def _make_azure_mock(text: str = "Extracted text.") -> Mock:
+    """Build a stand-in Azure client whose analysis poller yields ``text``."""
+    poller = Mock(details={"operation_id": "fake-op-id"})
+    poller.wait.return_value = None
+    poller.result.return_value.content = text
+    # The archive PDF is handed back as an iterable of byte chunks.
+    client = Mock()
+    client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
+    client.begin_analyze_document.return_value = poller
+    return client

-        # Simulate poller result and its `.details`
-        mock_poller = mock.Mock()
-        mock_poller.wait.return_value = None
-        mock_poller.details = {"operation_id": "fake-op-id"}
-        mock_client.begin_analyze_document.return_value = mock_poller
-        mock_poller.result.return_value.content = "This is a test document."

-        # Return dummy PDF bytes
-        mock_client.get_analyze_result_pdf.return_value = [
-            b"%PDF-",
-            b"1.7 ",
-            b"FAKEPDF",
-        ]
+# ---------------------------------------------------------------------------
+# Protocol contract
+# ---------------------------------------------------------------------------

-        # Simulate pdftotext by writing dummy text to sidecar file
-        def fake_run(cmd, *args, **kwargs) -> None:
-            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
-                f.write("This is a test document.")

-        mock_subprocess.side_effect = fake_run
+class TestRemoteParserProtocol:
+    """Protocol conformance and descriptive class attributes of the parser."""

-        with override_settings(
-            REMOTE_OCR_ENGINE="azureai",
-            REMOTE_OCR_API_KEY="somekey",
-            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
-        ):
-            parser = get_parser(uuid.uuid4())
-            parser.parse(
-                self.SAMPLE_FILES / "simple-digital.pdf",
-                "application/pdf",
-            )
-
-            self.assertContainsStrings(
-                parser.text.strip(),
-                ["This is a test document."],
-            )
-
-    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
-    def test_get_text_with_azure_error_logged_and_returns_none(
+    def test_isinstance_satisfies_protocol(
        self,
-        mock_client_cls,
+        remote_parser: RemoteDocumentParser,
    ) -> None:
-        mock_client = mock.Mock()
-        mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
-        mock_client_cls.return_value = mock_client
+        assert isinstance(remote_parser, ParserProtocol)

-        with override_settings(
-            REMOTE_OCR_ENGINE="azureai",
-            REMOTE_OCR_API_KEY="somekey",
-            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
-        ):
-            parser = get_parser(uuid.uuid4())
-            with mock.patch.object(parser.log, "error") as mock_log_error:
-                parser.parse(
-                    self.SAMPLE_FILES / "simple-digital.pdf",
-                    "application/pdf",
-                )
+    def test_class_attributes_present(self) -> None:
+        """Check the descriptive class attributes.
+
+        ``name``, ``version``, ``author`` and ``url`` must each be a
+        non-empty string on the class itself (no instance required).
+        """
+        for attr_name in ("name", "version", "author", "url"):
+            value = getattr(RemoteDocumentParser, attr_name)
+            assert isinstance(value, str)
+            assert value

-        self.assertIsNone(parser.text)
-        mock_client.begin_analyze_document.assert_called_once()
-        mock_client.close.assert_called_once()
-        mock_log_error.assert_called_once()
-        self.assertIn(
-            "Azure AI Vision parsing failed",
-            mock_log_error.call_args[0][0],
+
+# ---------------------------------------------------------------------------
+# supported_mime_types
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserSupportedMimeTypes:
+    """The advertised MIME types do not depend on the remote engine config."""
+
+    def test_returns_dict(self) -> None:
+        supported = RemoteDocumentParser.supported_mime_types()
+        assert isinstance(supported, dict)
+
+    def test_includes_all_expected_types(self) -> None:
+        wanted = {
+            "application/pdf",
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+        }
+        advertised = RemoteDocumentParser.supported_mime_types()
+        assert set(advertised) == wanted
+
+    def test_returns_full_set_when_not_configured(
+        self,
+        no_engine_settings: SettingsWrapper,
+    ) -> None:
+        """
+        GIVEN: Engine, API key and endpoint settings are all None
+        WHEN:  supported_mime_types() is queried on the class
+        THEN:  All 7 MIME types are reported; gating is left to score()
+        """
+        supported = RemoteDocumentParser.supported_mime_types()
+        assert len(supported) == 7
+
+
+# ---------------------------------------------------------------------------
+# score()
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserScore:
+    """score() gates activation: 20 for a configured engine, otherwise None."""
+
+    @pytest.mark.parametrize(
+        "mime_type",
+        [
+            "application/pdf",
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+        ],
+    )
+    def test_score_returns_20_when_configured(
+        self,
+        azure_settings: SettingsWrapper,
+        mime_type: str,
+    ) -> None:
+        score = RemoteDocumentParser.score(mime_type, "doc.pdf")
+        assert score == 20
+
+    @pytest.mark.parametrize(
+        "mime_type",
+        ["application/pdf", "image/png", "image/jpeg"],
+    )
+    def test_score_returns_none_when_no_engine(
+        self,
+        no_engine_settings: SettingsWrapper,
+        mime_type: str,
+    ) -> None:
+        score = RemoteDocumentParser.score(mime_type, "doc.pdf")
+        assert score is None
+
+    def test_score_returns_none_when_api_key_missing(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        # Engine and endpoint are set; only the API key is absent.
+        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
+        settings.REMOTE_OCR_ENGINE = "azureai"
+        settings.REMOTE_OCR_API_KEY = None
+        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None
+
+    def test_score_returns_none_when_endpoint_missing(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        # Engine and API key are set; only the endpoint is absent.
+        settings.REMOTE_OCR_ENDPOINT = None
+        settings.REMOTE_OCR_ENGINE = "azureai"
+        settings.REMOTE_OCR_API_KEY = "key"
+        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None
+
+    def test_score_returns_none_for_unsupported_mime_type(
+        self,
+        azure_settings: SettingsWrapper,
+    ) -> None:
+        score = RemoteDocumentParser.score("text/plain", "doc.txt")
+        assert score is None
+
+    def test_score_higher_than_tesseract_default(
+        self,
+        azure_settings: SettingsWrapper,
+    ) -> None:
+        """A configured remote parser must score above tesseract's default of 10."""
+        remote_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
+        assert remote_score is not None and remote_score > 10
+
+
+# ---------------------------------------------------------------------------
+# Properties
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserProperties:
+    def test_can_produce_archive_is_true(
+        self, remote_parser: RemoteDocumentParser
+    ) -> None:
+        """The parser advertises that it can produce an archive file."""
+        assert remote_parser.can_produce_archive is True
+
+    def test_requires_pdf_rendition_is_false(
+        self, remote_parser: RemoteDocumentParser
+    ) -> None:
+        """The parser does not ask for a PDF rendition of its input."""
+        assert remote_parser.requires_pdf_rendition is False
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserLifecycle:
+    def test_context_manager_cleans_up_tempdir(self) -> None:
+        with RemoteDocumentParser() as parser:
+            scratch_dir = parser._tempdir
+            assert scratch_dir.exists()
+        assert not scratch_dir.exists()
+
+    def test_context_manager_cleans_up_after_exception(self) -> None:
+        scratch_dir: Path | None = None
+        # pytest.raises is entered first, so the parser exits before it checks.
+        with pytest.raises(RuntimeError), RemoteDocumentParser() as parser:
+            scratch_dir = parser._tempdir
+            raise RuntimeError("boom")
+        assert scratch_dir is not None
+        assert not scratch_dir.exists()
+
+
+# ---------------------------------------------------------------------------
+# parse() — happy path with Azure mock
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserParse:
+    def test_parse_returns_text_from_azure(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        azure_client = _make_azure_mock("Hello from Azure.")
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=azure_client,
        )

-    @override_settings(
-        REMOTE_OCR_ENGINE="azureai",
-        REMOTE_OCR_API_KEY="key",
-        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
-    )
-    def test_supported_mime_types_valid_config(self) -> None:
-        parser = RemoteDocumentParser(uuid.uuid4())
-        expected_types = {
-            "application/pdf": ".pdf",
-            "image/png": ".png",
-            "image/jpeg": ".jpg",
-            "image/tiff": ".tiff",
-            "image/bmp": ".bmp",
-            "image/gif": ".gif",
-            "image/webp": ".webp",
-        }
-        self.assertEqual(parser.supported_mime_types(), expected_types)
+        remote_parser.parse(sample_pdf_file, "application/pdf")

-    def test_supported_mime_types_invalid_config(self) -> None:
-        parser = get_parser(uuid.uuid4())
-        self.assertEqual(parser.supported_mime_types(), {})
+        assert remote_parser.get_text() == "Hello from Azure."

-    @override_settings(
-        REMOTE_OCR_ENGINE=None,
-        REMOTE_OCR_API_KEY=None,
-        REMOTE_OCR_ENDPOINT=None,
-    )
-    def test_parse_with_invalid_config(self) -> None:
-        parser = get_parser(uuid.uuid4())
-        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
-        self.assertEqual(parser.text, "")
+    def test_parse_sets_archive_path(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        """A successful parse leaves an existing ``.pdf`` archive on disk."""
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=_make_azure_mock(),
+        )
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        archive_path = remote_parser.get_archive_path()
+        assert archive_path is not None
+        assert archive_path.exists()
+        assert archive_path.suffix == ".pdf"
+
+    def test_parse_closes_client_on_success(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        azure_client = _make_azure_mock()
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=azure_client,
+        )
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        azure_client.close.assert_called_once()
+
+    def test_parse_sets_empty_text_when_not_configured(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        no_engine_settings: SettingsWrapper,
+    ) -> None:
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+        # Expected without an engine: empty text and no archive.
+        assert remote_parser.get_archive_path() is None
+        assert remote_parser.get_text() == ""
+
+    def test_get_text_none_before_parse(
+        self, remote_parser: RemoteDocumentParser
+    ) -> None:
+        """No text is available until parse() has been called."""
+        assert remote_parser.get_text() is None
+
+    def test_get_date_always_none(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        """Even after a successful parse the remote parser reports no date."""
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=_make_azure_mock(),
+        )
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        assert remote_parser.get_date() is None
+
+
+# ---------------------------------------------------------------------------
+# parse() — Azure failure path
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserParseError:
+    def test_parse_returns_none_on_azure_error(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        bad_client = Mock()
+        bad_client.begin_analyze_document.side_effect = RuntimeError("network failure")
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=bad_client,
+        )
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        assert remote_parser.get_text() is None
+
+    def test_parse_closes_client_on_error(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        bad_client = Mock()
+        bad_client.begin_analyze_document.side_effect = RuntimeError("network failure")
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=bad_client,
+        )
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        bad_client.close.assert_called_once()
+
+    def test_parse_logs_error_on_azure_failure(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+        azure_settings: SettingsWrapper,
+        mocker: MockerFixture,
+    ) -> None:
+        bad_client = Mock()
+        bad_client.begin_analyze_document.side_effect = RuntimeError("network failure")
+        mocker.patch(
+            "azure.ai.documentintelligence.DocumentIntelligenceClient",
+            return_value=bad_client,
+        )
+        patched_log = mocker.patch("paperless.parsers.remote.logger")
+
+        remote_parser.parse(sample_pdf_file, "application/pdf")
+
+        patched_log.error.assert_called_once()
+        assert "Azure AI Vision parsing failed" in patched_log.error.call_args.args[0]
+
+
+# ---------------------------------------------------------------------------
+# get_page_count()
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserPageCount:
+    def test_page_count_for_pdf(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        pages = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
+        assert isinstance(pages, int)
+        assert pages >= 1
+
+    def test_page_count_returns_none_for_image_mime(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        # The file is a real PDF, but it is declared as image/png here.
+        assert remote_parser.get_page_count(sample_pdf_file, "image/png") is None
+
+    def test_page_count_returns_none_for_invalid_pdf(
+        self,
+        remote_parser: RemoteDocumentParser,
+        tmp_path: Path,
+    ) -> None:
+        not_a_pdf = tmp_path / "bad.pdf"
+        not_a_pdf.write_bytes(b"not a pdf at all")
+        pages = remote_parser.get_page_count(not_a_pdf, "application/pdf")
+        assert pages is None
+
+
+# ---------------------------------------------------------------------------
+# extract_metadata()
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserMetadata:
+    def test_extract_metadata_non_pdf_returns_empty(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        # The file is a real PDF, but it is declared as image/png here.
+        assert remote_parser.extract_metadata(sample_pdf_file, "image/png") == []
+
+    def test_extract_metadata_pdf_returns_list(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
+        assert isinstance(entries, list)
+
+    def test_extract_metadata_pdf_entries_have_required_keys(
+        self,
+        remote_parser: RemoteDocumentParser,
+        sample_pdf_file: Path,
+    ) -> None:
+        required = ("namespace", "prefix", "key", "value")
+        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
+        # NOTE: passes vacuously if the sample PDF yields no metadata entries.
+        for entry in entries:
+            for field in required:
+                assert field in entry
+            assert isinstance(entry["value"], str)
+
+    def test_extract_metadata_does_not_raise_on_invalid_pdf(
+        self,
+        remote_parser: RemoteDocumentParser,
+        tmp_path: Path,
+    ) -> None:
+        not_a_pdf = tmp_path / "bad.pdf"
+        not_a_pdf.write_bytes(b"not a pdf at all")
+        entries = remote_parser.extract_metadata(not_a_pdf, "application/pdf")
+        assert entries == []
+
+
+# ---------------------------------------------------------------------------
+# Registry integration
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteParserRegistry:
+    def test_registered_in_defaults(self) -> None:
+        from paperless.parsers.registry import ParserRegistry
+
+        fresh_registry = ParserRegistry()
+        fresh_registry.register_defaults()
+
+        assert RemoteDocumentParser in fresh_registry._builtins
+
+    def test_get_parser_returns_remote_when_configured(
+        self,
+        azure_settings: SettingsWrapper,
+    ) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        parser_registry = get_parser_registry()
+        chosen = parser_registry.get_parser_for_file("application/pdf", "doc.pdf")
+
+        assert chosen is RemoteDocumentParser
+
+    def test_get_parser_returns_none_for_pdf_when_not_configured(
+        self,
+        no_engine_settings: SettingsWrapper,
+    ) -> None:
+        """Until tesseract is registered, PDFs have no parser when remote is off."""
+        from paperless.parsers.registry import ParserRegistry
+
+        fresh_registry = ParserRegistry()
+        fresh_registry.register_defaults()
+        chosen = fresh_registry.get_parser_for_file("application/pdf", "doc.pdf")
+
+        assert chosen is None