Fix: text parser get_parser forwards logging_group, drops progress_callback

`TextDocumentParser.__init__` accepts `logging_group: object = None`, the same as `RemoteDocumentParser`. The old shim incorrectly dropped it; this change forwards it as a positional argument and drops only `progress_callback`. It also adds type annotations and `from __future__ import annotations` for consistency with the remote parser's signals shim. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Fix: get_parser factory forwards logging_group, drops progress_callback
2026-03-14 13:11:24 +00:00 · 2026-03-13 12:36:24 -07:00 · 2026-03-13 12:35:16 -07:00 · 2026-03-13 12:31:17 -07:00 · 2026-03-13 12:09:33 -07:00 · 2026-03-13 12:00:37 -07:00
26 changed files with 1361 additions and 868 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -157,9 +157,6 @@ updates:
      postgres:
        patterns:
          - "docker.io/library/postgres*"
      greenmail:
        patterns:
          - "docker.io/greenmail*"
  - package-ecosystem: "pre-commit" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
--- a/docker/compose/docker-compose.ci-test.yml
+++ b/docker/compose/docker-compose.ci-test.yml
@@ -18,13 +18,13 @@ services:
      - "--log-level=warn"
      - "--log-format=text"
  tika:
-    image: docker.io/apache/tika:3.2.3.0
+    image: docker.io/apache/tika:latest
    hostname: tika
    container_name: tika
    network_mode: host
    restart: unless-stopped
  greenmail:
-    image: docker.io/greenmail/standalone:2.1.8
+    image: greenmail/standalone:2.1.8
    hostname: greenmail
    container_name: greenmail
    environment:
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -51,8 +51,8 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 from paperless.parsers.remote import RemoteDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 from paperless_mail.parsers import MailDocumentParser
 LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
    TODO(stumpylog): Remove me in the future
    """
-    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
+    if isinstance(parser, (TextDocumentParser, RemoteDocumentParser)):
        parser.__exit__(None, None, None)
    else:
        parser.cleanup()
@@ -449,12 +449,6 @@ class ConsumerPlugin(
            progress_callback=progress_callback,
        )
        # New-style parsers use __enter__/__exit__ for resource management.
        # _parser_cleanup (below) handles __exit__; call __enter__ here.
        # TODO(stumpylog): Remove me in the future
        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
            document_parser.__enter__()
        self.log.debug(f"Parser: {type(document_parser).__name__}")
        # Parse the document. This may take some time.
@@ -483,7 +477,10 @@ class ConsumerPlugin(
                    self.filename,
                    self.input_doc.mailrule_id,
                )
-            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+            elif isinstance(
                document_parser,
                (TextDocumentParser, RemoteDocumentParser),
            ):
                # TODO(stumpylog): Remove me in the future
                document_parser.parse(self.working_copy, mime_type)
            else:
@@ -496,7 +493,7 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+            if isinstance(document_parser, (TextDocumentParser, RemoteDocumentParser)):
                # TODO(stumpylog): Remove me in the future
                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
            else:
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_tika.parsers import TikaDocumentParser
 class TestParserDiscovery(TestCase):
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -7,7 +7,6 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
 from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -226,7 +225,6 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
 from paperless.parsers import ParserProtocol
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1086,11 +1084,9 @@ class DocumentViewSet(
        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)
            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
            try:
-                with cm:
+                return parser.extract_metadata(file, mime_type)
                    return parser.extract_metadata(file, mime_type)
            except Exception:  # pragma: no cover
                logger.exception(f"Issue getting metadata for {file}")
                # TODO: cover GPG errors, remove later.
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -193,11 +193,11 @@ class ParserRegistry:
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
        from paperless.parsers.remote import RemoteDocumentParser
        from paperless.parsers.text import TextDocumentParser
        from paperless.parsers.tika import TikaDocumentParser
        self.register_builtin(TextDocumentParser)
-        self.register_builtin(TikaDocumentParser)
+        self.register_builtin(RemoteDocumentParser)
    # ------------------------------------------------------------------
    # Discovery
--- a/src/paperless/parsers/remote.py
+++ b/src/paperless/parsers/remote.py
@@ -0,0 +1,429 @@
 """
 Built-in remote-OCR document parser.
 Handles documents by sending them to a configured remote OCR engine
 (currently Azure AI Vision / Document Intelligence) and retrieving both
 the extracted text and a searchable PDF with an embedded text layer.
 When no engine is configured, ``score()`` returns ``None`` so the parser
 is effectively invisible to the registry — the tesseract parser handles
 these MIME types instead.
 """
 from __future__ import annotations
 import logging
 import shutil
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Self
 from django.conf import settings
 from paperless.version import __full_version_str__
 if TYPE_CHECKING:
    import datetime
    from types import TracebackType
    from paperless.parsers import MetadataEntry
 logger = logging.getLogger("paperless.parsing.remote")
 _SUPPORTED_MIME_TYPES: dict[str, str] = {
    "application/pdf": ".pdf",
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/tiff": ".tiff",
    "image/bmp": ".bmp",
    "image/gif": ".gif",
    "image/webp": ".webp",
 }
 class RemoteEngineConfig:
     """Container for the remote OCR engine settings plus a validity check."""
     def __init__(
         self,
         engine: str | None,
         api_key: str | None = None,
         endpoint: str | None = None,
     ) -> None:
         # Values are stored verbatim; validation happens in engine_is_valid().
         self.endpoint = endpoint
         self.api_key = api_key
         self.engine = engine
     def engine_is_valid(self) -> bool:
         """Report whether the engine is recognised and has all required settings.
         Returns
         -------
         bool
             True only when the engine name is known, an API key is set and,
             for the "azureai" engine, an endpoint is set as well.
         """
         # Unknown (or unset) engine names are never valid.
         if self.engine not in ("azureai",):
             return False
         # Every known engine needs an API key.
         if self.api_key is None:
             return False
         # The Azure engine additionally needs an endpoint.
         if self.engine == "azureai" and self.endpoint is None:
             return False
         return True
 class RemoteDocumentParser:
     """Parse documents via a remote OCR API (currently Azure AI Vision).
     This parser sends documents to a remote engine that returns both
     extracted text and a searchable PDF with an embedded text layer.
     It does not depend on Tesseract or ocrmypdf.
     Class attributes
     ----------------
     name : str
         Human-readable parser name.
     version : str
         Semantic version string, kept in sync with Paperless-ngx releases.
     author : str
         Maintainer name.
     url : str
         Issue tracker / source URL.
     """
     name: str = "Paperless-ngx Remote OCR Parser"
     version: str = __full_version_str__
     author: str = "Paperless-ngx Contributors"
     url: str = "https://github.com/paperless-ngx/paperless-ngx"
     # ------------------------------------------------------------------
     # Class methods
     # ------------------------------------------------------------------
     @classmethod
     def supported_mime_types(cls) -> dict[str, str]:
         """Return the MIME types this parser can handle.
         The full set is always returned regardless of whether a remote
         engine is configured.  The ``score()`` method handles the
         "am I active?" logic by returning ``None`` when not configured.
         Returns
         -------
         dict[str, str]
             Mapping of MIME type to preferred file extension.
         """
         return _SUPPORTED_MIME_TYPES
     @classmethod
     def score(
         cls,
         mime_type: str,
         filename: str,
         path: Path | None = None,
     ) -> int | None:
         """Return the priority score for handling this file, or None.
         Returns ``None`` when no valid remote engine is configured,
         making the parser invisible to the registry for this file.
         When configured, returns 20 — higher than the Tesseract parser's
         default of 10 — so the remote engine takes priority.
         Parameters
         ----------
         mime_type:
             Detected MIME type of the file.
         filename:
             Original filename including extension.
         path:
             Optional filesystem path. Not inspected by this parser.
         Returns
         -------
         int | None
             20 when the remote engine is configured and the MIME type is
             supported, otherwise None.
         """
         config = RemoteEngineConfig(
             engine=settings.REMOTE_OCR_ENGINE,
             api_key=settings.REMOTE_OCR_API_KEY,
             endpoint=settings.REMOTE_OCR_ENDPOINT,
         )
         if not config.engine_is_valid():
             return None
         if mime_type not in _SUPPORTED_MIME_TYPES:
             return None
         return 20
     # ------------------------------------------------------------------
     # Properties
     # ------------------------------------------------------------------
     @property
     def can_produce_archive(self) -> bool:
         """Whether this parser can produce a searchable PDF archive copy.
         Returns
         -------
         bool
             Always True — the remote engine always returns a PDF with an
             embedded text layer that serves as the archive copy.
         """
         return True
     @property
     def requires_pdf_rendition(self) -> bool:
         """Whether the parser must produce a PDF for the frontend to display.
         Returns
         -------
         bool
             Always False — all supported originals are displayable by
             the browser (PDF) or handled via the archive copy (images).
         """
         return False
     # ------------------------------------------------------------------
     # Lifecycle
     # ------------------------------------------------------------------
     def __init__(self, logging_group: object = None) -> None:
         """Create a private scratch directory and reset the parse results.
         Parameters
         ----------
         logging_group:
             Opaque logging group, stored and later forwarded to the
             thumbnail helper.
         """
         settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
         self._tempdir = Path(
             tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
         )
         self._logging_group = logging_group
         self._text: str | None = None
         self._archive_path: Path | None = None
     def __enter__(self) -> Self:
         """Enter the context; no resources beyond the temp directory are opened."""
         return self
     def __exit__(
         self,
         exc_type: type[BaseException] | None,
         exc_val: BaseException | None,
         exc_tb: TracebackType | None,
     ) -> None:
         """Remove the temporary directory; removal errors are ignored."""
         logger.debug("Cleaning up temporary directory %s", self._tempdir)
         shutil.rmtree(self._tempdir, ignore_errors=True)
     # ------------------------------------------------------------------
     # Core parsing interface
     # ------------------------------------------------------------------
     def parse(
         self,
         document_path: Path,
         mime_type: str,
         *,
         produce_archive: bool = True,
     ) -> None:
         """Send the document to the remote engine and store results.
         Parameters
         ----------
         document_path:
             Absolute path to the document file to parse.
         mime_type:
             Detected MIME type of the document.
         produce_archive:
             Ignored — the remote engine always returns a searchable PDF,
             which is stored as the archive copy regardless of this flag.
         """
         config = RemoteEngineConfig(
             engine=settings.REMOTE_OCR_ENGINE,
             api_key=settings.REMOTE_OCR_API_KEY,
             endpoint=settings.REMOTE_OCR_ENDPOINT,
         )
         if not config.engine_is_valid():
             logger.warning(
                 "No valid remote parser engine is configured, content will be empty.",
             )
             self._text = ""
             return
         if config.engine == "azureai":
             self._text = self._azure_ai_vision_parse(document_path, config)
     # ------------------------------------------------------------------
     # Result accessors
     # ------------------------------------------------------------------
     def get_text(self) -> str | None:
         """Return the plain-text content extracted during parse."""
         return self._text
     def get_date(self) -> datetime.datetime | None:
         """Return the document date detected during parse.
         Returns
         -------
         datetime.datetime | None
             Always None — the remote parser does not detect dates.
         """
         return None
     def get_archive_path(self) -> Path | None:
         """Return the path to the generated archive PDF, or None."""
         return self._archive_path
     # ------------------------------------------------------------------
     # Thumbnail and metadata
     # ------------------------------------------------------------------
     def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
         """Generate a thumbnail image for the document.
         Uses the archive PDF produced by the remote engine when available,
         otherwise falls back to the original document path (PDF inputs).
         Parameters
         ----------
         document_path:
             Absolute path to the source document.
         mime_type:
             Detected MIME type of the document.
         Returns
         -------
         Path
             Path to the generated WebP thumbnail inside the temp directory.
         """
         # make_thumbnail_from_pdf lives in documents.parsers for now;
         # it will move to paperless.parsers.utils when the tesseract
         # parser is migrated in a later phase.
         from documents.parsers import make_thumbnail_from_pdf
         return make_thumbnail_from_pdf(
             self._archive_path or document_path,
             self._tempdir,
             self._logging_group,
         )
     def get_page_count(
         self,
         document_path: Path,
         mime_type: str,
     ) -> int | None:
         """Return the number of pages in a PDF document.
         Parameters
         ----------
         document_path:
             Absolute path to the source document.
         mime_type:
             Detected MIME type of the document.
         Returns
         -------
         int | None
             Page count for PDF inputs, or ``None`` for other MIME types.
         """
         if mime_type != "application/pdf":
             return None
         from paperless.parsers.utils import get_page_count_for_pdf
         return get_page_count_for_pdf(document_path, log=logger)
     def extract_metadata(
         self,
         document_path: Path,
         mime_type: str,
     ) -> list[MetadataEntry]:
         """Extract format-specific metadata from the document.
         Delegates to the shared pikepdf-based extractor for PDF files.
         Returns ``[]`` for all other MIME types.
         Parameters
         ----------
         document_path:
             Absolute path to the file to extract metadata from.
         mime_type:
             MIME type of the file.  May be ``"application/pdf"`` when
             called for the archive version of an image original.
         Returns
         -------
         list[MetadataEntry]
             Zero or more metadata entries.
         """
         if mime_type != "application/pdf":
             return []
         from paperless.parsers.utils import extract_pdf_metadata
         return extract_pdf_metadata(document_path, log=logger)
     # ------------------------------------------------------------------
     # Private helpers
     # ------------------------------------------------------------------
     def _azure_ai_vision_parse(
         self,
         file: Path,
         config: RemoteEngineConfig,
     ) -> str | None:
         """Send ``file`` to Azure AI Document Intelligence and return text.
         Downloads the searchable PDF output from Azure and, only once the
         download has completed, stores its location in
         ``self._archive_path``.  Returns the extracted text content, or
         ``None`` on failure (the error is logged).  On failure
         ``self._archive_path`` is left untouched, so callers never see a
         path to a missing or partially written archive.
         Parameters
         ----------
         file:
             Absolute path to the document to analyse.
         config:
             Validated remote engine configuration.
         Returns
         -------
         str | None
             Extracted text, or None if the Azure call failed.
         """
         if TYPE_CHECKING:
             # Callers must have already validated config via engine_is_valid():
             # engine_is_valid() asserts api_key is not None and (for azureai)
             # endpoint is not None, so these casts are provably safe.
             assert config.endpoint is not None
             assert config.api_key is not None
         from azure.ai.documentintelligence import DocumentIntelligenceClient
         from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
         from azure.ai.documentintelligence.models import AnalyzeOutputOption
         from azure.ai.documentintelligence.models import DocumentContentFormat
         from azure.core.credentials import AzureKeyCredential
         client = DocumentIntelligenceClient(
             endpoint=config.endpoint,
             credential=AzureKeyCredential(config.api_key),
         )
         # Download into a local path first; the instance attribute is only
         # updated after every chunk has been written successfully.
         archive_path = self._tempdir / "archive.pdf"
         try:
             with file.open("rb") as f:
                 analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
                 poller = client.begin_analyze_document(
                     model_id="prebuilt-read",
                     body=analyze_request,
                     output_content_format=DocumentContentFormat.TEXT,
                     output=[AnalyzeOutputOption.PDF],
                     content_type="application/json",
                 )
             poller.wait()
             result_id = poller.details["operation_id"]
             result = poller.result()
             with archive_path.open("wb") as f:
                 for chunk in client.get_analyze_result_pdf(
                     model_id="prebuilt-read",
                     result_id=result_id,
                 ):
                     f.write(chunk)
             # The archive is complete; expose it to get_archive_path() and
             # get_thumbnail().
             self._archive_path = archive_path
             return result.content
         except Exception as e:
             # Best-effort: log and fall through to return None.  Any partial
             # archive file stays inside the temp directory, which __exit__
             # removes.
             logger.error("Azure AI Vision parsing failed: %s", e)
         finally:
             client.close()
         return None
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -1,440 +0,0 @@
 """
 Built-in Tika document parser.
 Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
 sending them to an Apache Tika server for text extraction and a Gotenberg
 server for PDF conversion.  Because the source formats cannot be rendered by
 a browser natively, the parser always produces a PDF rendition for display.
 """
 from __future__ import annotations
 import logging
 import shutil
 import tempfile
 from contextlib import ExitStack
 from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Self
 import httpx
 from django.conf import settings
 from django.utils import timezone
 from gotenberg_client import GotenbergClient
 from gotenberg_client.options import PdfAFormat
 from tika_client import TikaClient
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.config import OutputTypeConfig
 from paperless.models import OutputTypeChoices
 from paperless.version import __full_version_str__
 if TYPE_CHECKING:
    import datetime
    from types import TracebackType
    from paperless.parsers import MetadataEntry
 logger = logging.getLogger("paperless.parsing.tika")
 _SUPPORTED_MIME_TYPES: dict[str, str] = {
    "application/msword": ".doc",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.ms-excel": ".xls",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-powerpoint": ".ppt",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
    "application/vnd.oasis.opendocument.presentation": ".odp",
    "application/vnd.oasis.opendocument.spreadsheet": ".ods",
    "application/vnd.oasis.opendocument.text": ".odt",
    "application/vnd.oasis.opendocument.graphics": ".odg",
    "text/rtf": ".rtf",
 }
 class TikaDocumentParser:
     """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
     Text extraction is handled by the Tika server.  PDF conversion for display
     is handled by Gotenberg (LibreOffice route).  Because the source formats
     cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
     True and the PDF is always produced regardless of the ``produce_archive``
     flag passed to ``parse``.
     Both ``TikaClient`` and ``GotenbergClient`` are opened once in
     ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
     ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
     ``ExitStack.close()`` in ``__exit__``.  The parser must always be used
     as a context manager.
     Class attributes
     ----------------
     name : str
         Human-readable parser name.
     version : str
         Semantic version string, kept in sync with Paperless-ngx releases.
     author : str
         Maintainer name.
     url : str
         Issue tracker / source URL.
     """
     name: str = "Paperless-ngx Tika Parser"
     version: str = __full_version_str__
     author: str = "Paperless-ngx Contributors"
     url: str = "https://github.com/paperless-ngx/paperless-ngx"
     # ------------------------------------------------------------------
     # Class methods
     # ------------------------------------------------------------------
     @classmethod
     def supported_mime_types(cls) -> dict[str, str]:
         """Return the MIME types this parser handles.
         Returns
         -------
         dict[str, str]
             Mapping of MIME type to preferred file extension.
         """
         return _SUPPORTED_MIME_TYPES
     @classmethod
     def score(
         cls,
         mime_type: str,
         filename: str,
         path: Path | None = None,
     ) -> int | None:
         """Return the priority score for handling this file.
         Returns ``None`` when Tika integration is disabled so the registry
         skips this parser entirely.
         Parameters
         ----------
         mime_type:
             Detected MIME type of the file.
         filename:
             Original filename including extension.
         path:
             Optional filesystem path. Not inspected by this parser.
         Returns
         -------
         int | None
             10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
         """
         if not settings.TIKA_ENABLED:
             return None
         if mime_type in _SUPPORTED_MIME_TYPES:
             return 10
         return None
     # ------------------------------------------------------------------
     # Properties
     # ------------------------------------------------------------------
     @property
     def can_produce_archive(self) -> bool:
         """Whether this parser can produce a searchable PDF archive copy.
         Returns
         -------
         bool
             Always False — Tika produces a display PDF, not an OCR archive.
         """
         return False
     @property
     def requires_pdf_rendition(self) -> bool:
         """Whether the parser must produce a PDF for the frontend to display.
         Returns
         -------
         bool
             Always True — Office formats cannot be rendered natively in a
             browser, so a PDF conversion is always required for display.
         """
         return True
     # ------------------------------------------------------------------
     # Lifecycle
     # ------------------------------------------------------------------
     def __init__(self, logging_group: object = None) -> None:
         """Create a private scratch directory and reset the parse results.
         Parameters
         ----------
         logging_group:
             Opaque logging group, stored and later forwarded to the
             thumbnail helper.
         """
         settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
         self._tempdir = Path(
             tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
         )
         # Kept so get_thumbnail() can forward it to make_thumbnail_from_pdf.
         self._logging_group = logging_group
         self._text: str | None = None
         self._date: datetime.datetime | None = None
         self._archive_path: Path | None = None
         self._exit_stack = ExitStack()
         self._tika_client: TikaClient | None = None
         self._gotenberg_client: GotenbergClient | None = None
     def __enter__(self) -> Self:
         """Open the Tika and Gotenberg clients on the shared exit stack.
         If opening either client raises, anything already opened is closed
         before the exception propagates: ``__exit__`` is not invoked by a
         ``with`` statement when ``__enter__`` itself fails.
         """
         try:
             self._tika_client = self._exit_stack.enter_context(
                 TikaClient(
                     tika_url=settings.TIKA_ENDPOINT,
                     timeout=settings.CELERY_TASK_TIME_LIMIT,
                 ),
             )
             self._gotenberg_client = self._exit_stack.enter_context(
                 GotenbergClient(
                     host=settings.TIKA_GOTENBERG_ENDPOINT,
                     timeout=settings.CELERY_TASK_TIME_LIMIT,
                 ),
             )
         except BaseException:
             # Release whichever client was opened before the failure.
             self._exit_stack.close()
             self._tika_client = None
             self._gotenberg_client = None
             raise
         return self
     def __exit__(
         self,
         exc_type: type[BaseException] | None,
         exc_val: BaseException | None,
         exc_tb: TracebackType | None,
     ) -> None:
         """Close both clients and remove the temporary directory."""
         self._exit_stack.close()
         logger.debug("Cleaning up temporary directory %s", self._tempdir)
         shutil.rmtree(self._tempdir, ignore_errors=True)
     # ------------------------------------------------------------------
     # Core parsing interface
     # ------------------------------------------------------------------
     def parse(
         self,
         document_path: Path,
         mime_type: str,
         *,
         produce_archive: bool = True,
     ) -> None:
         """Send the document to Tika for text extraction and Gotenberg for PDF.
         Because ``requires_pdf_rendition`` is True the PDF conversion is
         always performed — the ``produce_archive`` flag is intentionally
         ignored.
         Parameters
         ----------
         document_path:
             Absolute path to the document file to parse.
         mime_type:
             Detected MIME type of the document.
         produce_archive:
             Accepted for protocol compatibility but ignored; the PDF rendition
             is always produced since the source format cannot be displayed
             natively in the browser.
         Raises
         ------
         documents.parsers.ParseError
             If Tika or Gotenberg returns an error.
         """
         if TYPE_CHECKING:
             assert self._tika_client is not None
         logger.info("Sending %s to Tika server", document_path)
         try:
             try:
                 parsed = self._tika_client.tika.as_text.from_file(
                     document_path,
                     mime_type,
                 )
             except httpx.HTTPStatusError as err:
                 # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                 # Tika fails with some files as multi-part form data
                 if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                     parsed = self._tika_client.tika.as_text.from_buffer(
                         document_path.read_bytes(),
                         mime_type,
                     )
                 else:  # pragma: no cover
                     raise
         except Exception as err:
             raise ParseError(
                 f"Could not parse {document_path} with tika server at "
                 f"{settings.TIKA_ENDPOINT}: {err}",
             ) from err
         self._text = parsed.content
         if self._text is not None:
             self._text = self._text.strip()
         self._date = parsed.created
         if self._date is not None and timezone.is_naive(self._date):
             self._date = timezone.make_aware(self._date)
         # Always convert — requires_pdf_rendition=True means the browser
         # cannot display the source format natively.
         self._archive_path = self._convert_to_pdf(document_path)
     # ------------------------------------------------------------------
     # Result accessors
     # ------------------------------------------------------------------
     def get_text(self) -> str | None:
         """Return the plain-text content extracted during parse.
         Returns
         -------
         str | None
             Extracted text, or None if parse has not been called yet.
         """
         return self._text
     def get_date(self) -> datetime.datetime | None:
         """Return the document date detected during parse.
         Returns
         -------
         datetime.datetime | None
             Creation date from Tika metadata, or None if not detected.
         """
         return self._date
     def get_archive_path(self) -> Path | None:
         """Return the path to the generated PDF rendition, or None.
         Returns
         -------
         Path | None
             Path to the PDF produced by Gotenberg, or None if parse has not
             been called yet.
         """
         return self._archive_path
     # ------------------------------------------------------------------
     # Thumbnail and metadata
     # ------------------------------------------------------------------
     def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
         """Generate a thumbnail from the PDF rendition of the document.
         Converts the document to PDF first if not already done.  The
         logging group given to the constructor is forwarded to the
         thumbnail helper.
         Parameters
         ----------
         document_path:
             Absolute path to the source document.
         mime_type:
             Detected MIME type of the document.
         Returns
         -------
         Path
             Path to the generated WebP thumbnail inside the temporary directory.
         """
         if self._archive_path is None:
             self._archive_path = self._convert_to_pdf(document_path)
         return make_thumbnail_from_pdf(
             self._archive_path,
             self._tempdir,
             self._logging_group,
         )
     def get_page_count(
         self,
         document_path: Path,
         mime_type: str,
     ) -> int | None:
         """Return the number of pages in the document.
         Returns
         -------
         int | None
             Always None — page count is not available from Tika.
         """
         return None
     def extract_metadata(
         self,
         document_path: Path,
         mime_type: str,
     ) -> list[MetadataEntry]:
         """Extract format-specific metadata via the Tika metadata endpoint.
         Returns
         -------
         list[MetadataEntry]
             All key/value pairs returned by Tika, or ``[]`` on error.
         """
         if TYPE_CHECKING:
             assert self._tika_client is not None
         try:
             parsed = self._tika_client.metadata.from_file(document_path, mime_type)
             return [
                 {
                     "namespace": "",
                     "prefix": "",
                     "key": key,
                     "value": parsed.data[key],
                 }
                 for key in parsed.data
             ]
         except Exception as e:
             # Metadata is best-effort: log and return nothing rather than fail.
             logger.warning(
                 "Error while fetching document metadata for %s: %s",
                 document_path,
                 e,
             )
             return []
     # ------------------------------------------------------------------
     # Private helpers
     # ------------------------------------------------------------------
     def _convert_to_pdf(self, document_path: Path) -> Path:
         """Convert the document to PDF using Gotenberg's LibreOffice route.
         Parameters
         ----------
         document_path:
             Absolute path to the source document.
         Returns
         -------
         Path
             Path to the generated PDF inside the temporary directory.
         Raises
         ------
         documents.parsers.ParseError
             If Gotenberg returns an error.
         """
         if TYPE_CHECKING:
             assert self._gotenberg_client is not None
         pdf_path = self._tempdir / "convert.pdf"
         logger.info("Converting %s to PDF as %s", document_path, pdf_path)
         with self._gotenberg_client.libre_office.to_pdf() as route:
             # Set the output format of the resulting PDF.
             # OutputTypeConfig reads the database-stored ApplicationConfiguration
             # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
             output_type = OutputTypeConfig().output_type
             if output_type in {
                 OutputTypeChoices.PDF_A,
                 OutputTypeChoices.PDF_A2,
             }:
                 route.pdf_format(PdfAFormat.A2b)
             elif output_type == OutputTypeChoices.PDF_A1:
                 logger.warning(
                     "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                 )
                 route.pdf_format(PdfAFormat.A2b)
             elif output_type == OutputTypeChoices.PDF_A3:
                 route.pdf_format(PdfAFormat.A3b)
             route.convert(document_path)
             try:
                 response = route.run()
                 pdf_path.write_bytes(response.content)
                 return pdf_path
             except Exception as err:
                 raise ParseError(
                     f"Error while converting document to PDF: {err}",
                 ) from err
--- a/src/paperless/parsers/utils.py
+++ b/src/paperless/parsers/utils.py
@@ -0,0 +1,130 @@
 """
 Shared utilities for Paperless-ngx document parsers.
 Functions here are format-neutral helpers that multiple parsers need.
 Keeping them here avoids parsers inheriting from each other just to
 share implementation.
 """
 from __future__ import annotations
 import logging
 import re
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
    from pathlib import Path
    from paperless.parsers import MetadataEntry
 logger = logging.getLogger("paperless.parsers.utils")
 def get_page_count_for_pdf(
    document_path: Path,
    log: logging.Logger | None = None,
 ) -> int | None:
    """Count the pages of a PDF by opening it with pikepdf.
    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger that receives the warning on failure.  The module-level
        logger is used when this is omitted.
    Returns
    -------
    int | None
        Number of pages, or ``None`` when opening or reading the file
        raised an exception (the failure is logged as a warning).
    """
    import pikepdf
    active_log = log or logger
    try:
        with pikepdf.Pdf.open(document_path) as pdf:
            page_total = len(pdf.pages)
    except Exception as exc:
        # Best effort by design: a broken file yields None, not an error.
        active_log.warning(
            "Unable to determine PDF page count for %s: %s",
            document_path,
            exc,
        )
        return None
    return page_total
 def extract_pdf_metadata(
    document_path: Path,
    log: logging.Logger | None = None,
 ) -> list[MetadataEntry]:
    """Extract XMP/PDF metadata from a PDF file using pikepdf.
    Reads all XMP metadata entries from the document and returns them as a
    list of ``MetadataEntry`` dicts.  A failure to open the file or to read
    a specific key is logged and skipped.  The PDF handle is closed on every
    exit path so repeated calls do not leak open files.
    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger to use for warnings and debug messages.  Falls back to the
        module-level logger when omitted.
    Returns
    -------
    list[MetadataEntry]
        Zero or more metadata entries.  Returns ``[]`` if the file cannot
        be opened or its metadata cannot be loaded.
    """
    import pikepdf
    from paperless.parsers import MetadataEntry
    _log = log or logger
    result: list[MetadataEntry] = []
    # Keys are expected in "{namespace}local-name" form; anything else is
    # skipped below.
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    try:
        pdf = pikepdf.open(document_path)
    except Exception as e:
        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
        return []
    # Entering the already-open Pdf as a context manager guarantees the
    # underlying file is closed on every exit path, including early returns.
    with pdf:
        try:
            meta = pdf.open_metadata()
        except Exception as e:
            _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
            return []
        for key, value in meta.items():
            # Multi-valued entries are flattened into one space-separated string.
            if isinstance(value, list):
                value = " ".join(str(e) for e in value)
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                if m is None:
                    continue
                namespace = m.group(1)
                key_value = m.group(2)
                try:
                    # Entries that cannot be encoded as UTF-8 are dropped.
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as enc_err:
                    _log.debug("Skipping metadata key %s: %s", key, enc_err)
                    continue
                result.append(
                    MetadataEntry(
                        namespace=namespace,
                        # An unknown namespace raises here and is reported by
                        # the handler below instead of aborting the whole scan.
                        prefix=meta.REVERSE_NS[namespace],
                        key=key_value,
                        value=value,
                    ),
                )
            except Exception as e:
                _log.warning(
                    "Error reading metadata key %s value %s: %s",
                    key,
                    value,
                    e,
                )
    return result
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -10,13 +10,15 @@ from typing import TYPE_CHECKING
 import pytest
 from paperless.parsers.remote import RemoteDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 if TYPE_CHECKING:
    from collections.abc import Generator
    from pathlib import Path
    from pytest_django.fixtures import SettingsWrapper
 # ------------------------------------------------------------------
 # Text parser sample files
@@ -78,83 +80,86 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
 # ------------------------------------------------------------------
-# Tika parser sample files
+# Remote parser sample files
 # ------------------------------------------------------------------
@pytest.fixture(scope="session")
-def tika_samples_dir(samples_dir: Path) -> Path:
+def remote_samples_dir(samples_dir: Path) -> Path:
-    """Absolute path to the Tika parser sample files directory.
+    """Absolute path to the remote parser sample files directory.
    Returns
    -------
    Path
-        ``<samples_dir>/tika/``
+        ``<samples_dir>/remote/``
    """
-    return samples_dir / "tika"
+    return samples_dir / "remote"
@pytest.fixture(scope="session")
-def sample_odt_file(tika_samples_dir: Path) -> Path:
+def sample_pdf_file(remote_samples_dir: Path) -> Path:
-    """Path to a sample ODT file.
+    """Path to a simple digital PDF sample file.
    Returns
    -------
    Path
-        Absolute path to ``tika/sample.odt``.
+        Absolute path to ``remote/simple-digital.pdf``.
    """
-    return tika_samples_dir / "sample.odt"
+    return remote_samples_dir / "simple-digital.pdf"
@pytest.fixture(scope="session")
 def sample_docx_file(tika_samples_dir: Path) -> Path:
    """Locate the sample DOCX document.
    Returns
    -------
    Path
        ``sample.docx`` inside the Tika samples directory
        (``tika/sample.docx``).
    """
    return tika_samples_dir.joinpath("sample.docx")
@pytest.fixture(scope="session")
 def sample_doc_file(tika_samples_dir: Path) -> Path:
    """Locate the sample DOC document.
    Returns
    -------
    Path
        ``sample.doc`` inside the Tika samples directory
        (``tika/sample.doc``).
    """
    return tika_samples_dir.joinpath("sample.doc")
@pytest.fixture(scope="session")
 def sample_broken_odt(tika_samples_dir: Path) -> Path:
    """Locate the broken ODT file used to trigger the multi-part fallback.
    Returns
    -------
    Path
        ``multi-part-broken.odt`` inside the Tika samples directory
        (``tika/multi-part-broken.odt``).
    """
    return tika_samples_dir.joinpath("multi-part-broken.odt")
 # ------------------------------------------------------------------
-# Tika parser instance
+# Remote parser instance
 # ------------------------------------------------------------------
@pytest.fixture()
-def tika_parser() -> Generator[TikaDocumentParser, None, None]:
+def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
-    """Yield a TikaDocumentParser and clean up its temporary directory afterwards.
+    """Yield a RemoteDocumentParser and clean up its temporary directory afterwards.
    Yields
    ------
-    TikaDocumentParser
+    RemoteDocumentParser
        A ready-to-use parser instance.
    """
-    with TikaDocumentParser() as parser:
+    with RemoteDocumentParser() as parser:
        yield parser
 # ------------------------------------------------------------------
 # Remote parser settings helpers
 # ------------------------------------------------------------------
@pytest.fixture()
 def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Point the remote OCR settings at a valid Azure AI test configuration.
    Assigns test values to ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY`` and
    ``REMOTE_OCR_ENDPOINT``.  pytest-django restores the settings
    automatically once the test has finished.
    Returns
    -------
    SettingsWrapper
        The same settings object, so further overrides can be chained.
    """
    overrides = {
        "REMOTE_OCR_ENGINE": "azureai",
        "REMOTE_OCR_API_KEY": "test-api-key",
        "REMOTE_OCR_ENDPOINT": "https://test.cognitiveservices.azure.com",
    }
    for setting_name, setting_value in overrides.items():
        setattr(settings, setting_name, setting_value)
    return settings
@pytest.fixture()
 def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Clear every remote OCR setting so that no engine is configured.
    Returns
    -------
    SettingsWrapper
        The same settings object, with the engine, API key and endpoint
        all set to ``None``.
    """
    for setting_name in (
        "REMOTE_OCR_ENGINE",
        "REMOTE_OCR_API_KEY",
        "REMOTE_OCR_ENDPOINT",
    ):
        setattr(settings, setting_name, None)
    return settings
--- a/src/paperless/tests/parsers/test_remote_parser.py
+++ b/src/paperless/tests/parsers/test_remote_parser.py
@@ -0,0 +1,490 @@
 """
 Tests for paperless.parsers.remote.RemoteDocumentParser.
 All tests use the context-manager protocol for parser lifecycle.
 Fixture layout
 --------------
 make_azure_mock  — factory (defined here; specific to this module)
 azure_client     — composes azure_settings + make_azure_mock + patch;
                   use when a test needs the client to succeed
 failing_azure_client
                 — composes azure_settings + patch with RuntimeError;
                   use when a test needs the client to fail
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 from unittest.mock import Mock
 import pytest
 from paperless.parsers import ParserProtocol
 from paperless.parsers.remote import RemoteDocumentParser
 if TYPE_CHECKING:
    from collections.abc import Callable
    from pathlib import Path
    from pytest_django.fixtures import SettingsWrapper
    from pytest_mock import MockerFixture
 # ---------------------------------------------------------------------------
 # Module-local fixtures
 # ---------------------------------------------------------------------------
 _AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
 _DEFAULT_TEXT = "Extracted text."
@pytest.fixture()
 def make_azure_mock() -> Callable[[str], Mock]:
    """Provide a factory for mock Azure DocumentIntelligenceClient objects.
    The produced client hands out a poller whose result carries the
    requested text, and serves a single fake PDF chunk.
    Usage::
        mock_client = make_azure_mock()            # default extracted text
        mock_client = make_azure_mock("My text.")  # custom extracted text
    """
    def _build(text: str = _DEFAULT_TEXT) -> Mock:
        # The poller stands in for the long-running analyse operation.
        poller = Mock(details={"operation_id": "fake-op-id"})
        poller.wait.return_value = None
        poller.result.return_value.content = text
        client = Mock()
        client.begin_analyze_document.return_value = poller
        client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
        return client
    return _build
@pytest.fixture()
 def azure_client(
    azure_settings: SettingsWrapper,
    make_azure_mock: Callable[[str], Mock],
    mocker: MockerFixture,
 ) -> Mock:
    """Install a succeeding mock in place of the Azure DI client class.
    Requesting this fixture also pulls in ``azure_settings``, so tests do
    not need ``@pytest.mark.usefixtures("azure_settings")`` on top of it.
    The returned mock is the instance the patched class hands out.
    """
    client = make_azure_mock()
    mocker.patch(_AZURE_CLIENT_TARGET, return_value=client)
    return client
@pytest.fixture()
 def failing_azure_client(
    azure_settings: SettingsWrapper,
    mocker: MockerFixture,
 ) -> Mock:
    """Install a mock Azure DI client whose analyse call always fails.
    ``begin_analyze_document`` raises ``RuntimeError("network failure")``.
    Requesting this fixture also pulls in ``azure_settings``.  The mock
    instance is returned so tests can assert on calls such as ``close()``.
    """
    client = Mock(
        **{"begin_analyze_document.side_effect": RuntimeError("network failure")},
    )
    mocker.patch(_AZURE_CLIENT_TARGET, return_value=client)
    return client
 # ---------------------------------------------------------------------------
 # Protocol contract
 # ---------------------------------------------------------------------------
 class TestRemoteParserProtocol:
    """Checks that RemoteDocumentParser honours the ParserProtocol contract."""
    def test_isinstance_satisfies_protocol(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        assert isinstance(remote_parser, ParserProtocol)
    def test_class_attributes_present(self) -> None:
        # Each descriptive class attribute must be a non-empty string.
        for attribute in ("name", "version", "author", "url"):
            declared = getattr(RemoteDocumentParser, attribute)
            assert isinstance(declared, str)
            assert declared
 # ---------------------------------------------------------------------------
 # supported_mime_types
 # ---------------------------------------------------------------------------
 class TestRemoteParserSupportedMimeTypes:
    """supported_mime_types() reports the full set whatever the configuration."""
    def test_returns_dict(self) -> None:
        assert isinstance(RemoteDocumentParser.supported_mime_types(), dict)
    def test_includes_all_expected_types(self) -> None:
        advertised = set(RemoteDocumentParser.supported_mime_types())
        assert advertised == {
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        }
    @pytest.mark.usefixtures("no_engine_settings")
    def test_returns_full_set_when_not_configured(self) -> None:
        """
        GIVEN: No remote engine is configured
        WHEN:  supported_mime_types() is called
        THEN:  The full MIME type dict is still returned (score() handles activation)
        """
        advertised = RemoteDocumentParser.supported_mime_types()
        assert len(advertised) == 7
 # ---------------------------------------------------------------------------
 # score()
 # ---------------------------------------------------------------------------
 class TestRemoteParserScore:
    """score() gates activation: None while unconfigured, 20 once active."""
    @pytest.mark.usefixtures("azure_settings")
    @pytest.mark.parametrize(
        "mime_type",
        [
            pytest.param("application/pdf", id="pdf"),
            pytest.param("image/png", id="png"),
            pytest.param("image/jpeg", id="jpeg"),
            pytest.param("image/tiff", id="tiff"),
            pytest.param("image/bmp", id="bmp"),
            pytest.param("image/gif", id="gif"),
            pytest.param("image/webp", id="webp"),
        ],
    )
    def test_score_returns_20_when_configured(self, mime_type: str) -> None:
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") == 20
    @pytest.mark.usefixtures("no_engine_settings")
    @pytest.mark.parametrize(
        "mime_type",
        [
            pytest.param("application/pdf", id="pdf"),
            pytest.param("image/png", id="png"),
            pytest.param("image/jpeg", id="jpeg"),
        ],
    )
    def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") is None
    def test_score_returns_none_when_api_key_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        # Engine and endpoint are present; only the API key is absent.
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = None
        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None
    def test_score_returns_none_when_endpoint_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        # Engine and API key are present; only the endpoint is absent.
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = "key"
        settings.REMOTE_OCR_ENDPOINT = None
        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None
    @pytest.mark.usefixtures("azure_settings")
    def test_score_returns_none_for_unsupported_mime_type(self) -> None:
        assert RemoteDocumentParser.score("text/plain", "doc.txt") is None
    @pytest.mark.usefixtures("azure_settings")
    def test_score_higher_than_tesseract_default(self) -> None:
        """Remote parser (20) outranks the tesseract default (10) when configured."""
        remote_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
        assert remote_score is not None
        assert remote_score > 10
 # ---------------------------------------------------------------------------
 # Properties
 # ---------------------------------------------------------------------------
 class TestRemoteParserProperties:
    """Pins the capability flags exposed by RemoteDocumentParser."""
    def test_can_produce_archive_is_true(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        produces_archive = remote_parser.can_produce_archive
        assert produces_archive is True
    def test_requires_pdf_rendition_is_false(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        needs_rendition = remote_parser.requires_pdf_rendition
        assert needs_rendition is False
 # ---------------------------------------------------------------------------
 # Lifecycle
 # ---------------------------------------------------------------------------
 class TestRemoteParserLifecycle:
    """The context manager must remove the parser's temporary directory."""
    def test_context_manager_cleans_up_tempdir(self) -> None:
        with RemoteDocumentParser() as parser:
            workdir = parser._tempdir
            assert workdir.exists()
        assert not workdir.exists()
    def test_context_manager_cleans_up_after_exception(self) -> None:
        workdir: Path | None = None
        # Cleanup has to happen even when the managed block raises.
        with pytest.raises(RuntimeError), RemoteDocumentParser() as parser:
            workdir = parser._tempdir
            raise RuntimeError("boom")
        assert workdir is not None
        assert not workdir.exists()
 # ---------------------------------------------------------------------------
 # parse() — happy path
 # ---------------------------------------------------------------------------
 class TestRemoteParserParse:
    """parse() behaviour when the Azure client succeeds or is not configured."""
    def test_parse_returns_text_from_azure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        extracted = remote_parser.get_text()
        assert extracted == _DEFAULT_TEXT
    def test_parse_sets_archive_path(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        archive_file = remote_parser.get_archive_path()
        assert archive_file is not None
        assert archive_file.exists()
        assert archive_file.suffix == ".pdf"
    def test_parse_closes_client_on_success(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        azure_client.close.assert_called_once()
    @pytest.mark.usefixtures("no_engine_settings")
    def test_parse_sets_empty_text_when_not_configured(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        extracted = remote_parser.get_text()
        assert extracted == ""
        archive_file = remote_parser.get_archive_path()
        assert archive_file is None
    def test_get_text_none_before_parse(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        extracted = remote_parser.get_text()
        assert extracted is None
    def test_get_date_always_none(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        detected_date = remote_parser.get_date()
        assert detected_date is None
 # ---------------------------------------------------------------------------
 # parse() — Azure failure path
 # ---------------------------------------------------------------------------
 class TestRemoteParserParseError:
    """parse() behaviour when the Azure client raises."""
    def test_parse_returns_none_on_azure_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        extracted = remote_parser.get_text()
        assert extracted is None
    def test_parse_closes_client_on_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
    ) -> None:
        remote_parser.parse(sample_pdf_file, "application/pdf")
        failing_azure_client.close.assert_called_once()
    def test_parse_logs_error_on_azure_failure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
        mocker: MockerFixture,
    ) -> None:
        patched_logger = mocker.patch("paperless.parsers.remote.logger")
        remote_parser.parse(sample_pdf_file, "application/pdf")
        patched_logger.error.assert_called_once()
        # The first positional argument of the log call is the message.
        logged_message = patched_logger.error.call_args.args[0]
        assert "Azure AI Vision parsing failed" in logged_message
 # ---------------------------------------------------------------------------
 # get_page_count()
 # ---------------------------------------------------------------------------
 class TestRemoteParserPageCount:
    """get_page_count() for PDFs, non-PDF MIME types and unreadable files."""
    def test_page_count_for_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        pages = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
        assert isinstance(pages, int)
        assert pages >= 1
    def test_page_count_returns_none_for_image_mime(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        pages = remote_parser.get_page_count(sample_pdf_file, "image/png")
        assert pages is None
    def test_page_count_returns_none_for_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        corrupt_file = tmp_path / "bad.pdf"
        corrupt_file.write_bytes(b"not a pdf at all")
        pages = remote_parser.get_page_count(corrupt_file, "application/pdf")
        assert pages is None
 # ---------------------------------------------------------------------------
 # extract_metadata()
 # ---------------------------------------------------------------------------
 class TestRemoteParserMetadata:
    """extract_metadata() for PDFs, non-PDF MIME types and unreadable files."""
    def test_extract_metadata_non_pdf_returns_empty(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        entries = remote_parser.extract_metadata(sample_pdf_file, "image/png")
        assert entries == []
    def test_extract_metadata_pdf_returns_list(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
        assert isinstance(entries, list)
    def test_extract_metadata_pdf_entries_have_required_keys(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
        for entry in entries:
            for required_key in ("namespace", "prefix", "key", "value"):
                assert required_key in entry
            assert isinstance(entry["value"], str)
    def test_extract_metadata_does_not_raise_on_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        corrupt_file = tmp_path / "bad.pdf"
        corrupt_file.write_bytes(b"not a pdf at all")
        entries = remote_parser.extract_metadata(corrupt_file, "application/pdf")
        assert entries == []
 # ---------------------------------------------------------------------------
 # Registry integration
 # ---------------------------------------------------------------------------
 class TestRemoteParserRegistry:
    """RemoteDocumentParser's registration and selection through the registry."""
    def test_registered_in_defaults(self) -> None:
        from paperless.parsers.registry import ParserRegistry
        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()
        assert RemoteDocumentParser in fresh_registry._builtins
    @pytest.mark.usefixtures("azure_settings")
    def test_get_parser_returns_remote_when_configured(self) -> None:
        from paperless.parsers.registry import get_parser_registry
        chosen = get_parser_registry().get_parser_for_file(
            "application/pdf",
            "doc.pdf",
        )
        assert chosen is RemoteDocumentParser
    @pytest.mark.usefixtures("no_engine_settings")
    def test_get_parser_returns_none_for_pdf_when_not_configured(self) -> None:
        """With no tesseract parser registered yet, PDF has no handler if remote is off."""
        from paperless.parsers.registry import ParserRegistry
        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()
        chosen = fresh_registry.get_parser_for_file("application/pdf", "doc.pdf")
        assert chosen is None
--- a/src/paperless/tests/samples/remote/simple-digital.pdf
+++ b/src/paperless/tests/samples/remote/simple-digital.pdf
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -1,118 +0,0 @@
 from pathlib import Path
 from django.conf import settings
 from paperless_tesseract.parsers import RasterisedDocumentParser
 class RemoteEngineConfig:
    """Connection settings for a remote OCR engine.
    Stores the engine identifier together with its API key and endpoint,
    and reports whether the combination is usable.
    """
    def __init__(
        self,
        engine: str,
        api_key: str | None = None,
        endpoint: str | None = None,
    ):
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint
    def engine_is_valid(self):
        """Return True only for a recognised engine with its credentials set."""
        # "azureai" is the only recognised engine, and it requires both an
        # API key and an endpoint.
        if self.engine != "azureai":
            return False
        return self.api_key is not None and self.endpoint is not None
 class RemoteDocumentParser(RasterisedDocumentParser):
    """
    Parser that hands OCR off to a remote engine. Azure AI Vision is the only
    engine implemented here; per the original author it is the only service
    offering a remote OCR API with text-embedded PDF output.
    """
    logging_name = "paperless.parsing.remote"
    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote OCR engine configuration from Django settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )
    def supported_mime_types(self):
        """
        Return the MIME type to extension mapping, or an empty dict when
        the engine configuration is not valid.
        """
        if not self.settings.engine_is_valid():
            return {}
        return {
            "application/pdf": ".pdf",
            "image/png": ".png",
            "image/jpeg": ".jpg",
            "image/tiff": ".tiff",
            "image/bmp": ".bmp",
            "image/gif": ".gif",
            "image/webp": ".webp",
        }
    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Send the document to Azure AI Vision and return the extracted text.
        A searchable PDF with embedded text is requested alongside the text
        and written to ``archive.pdf`` in the temporary directory; its path
        is stored in the archive_path attribute.
        Any failure is logged and results in None. The client is closed in
        every case.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential
        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )
        try:
            with file.open("rb") as source:
                request = AnalyzeDocumentRequest(bytes_source=source.read())
                poller = client.begin_analyze_document(
                    model_id="prebuilt-read",
                    body=request,
                    output_content_format=DocumentContentFormat.TEXT,
                    output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                    content_type="application/json",
                )
            poller.wait()
            operation_id = poller.details["operation_id"]
            analysis = poller.result()
            # Download the PDF with embedded text, chunk by chunk.
            self.archive_path = self.tempdir / "archive.pdf"
            with self.archive_path.open("wb") as archive:
                archive.writelines(
                    client.get_analyze_result_pdf(
                        model_id="prebuilt-read",
                        result_id=operation_id,
                    ),
                )
            return analysis.content
        except Exception as exc:
            self.log.error(f"Azure AI Vision parsing failed: {exc}")
            return None
        finally:
            client.close()
    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Populate ``self.text`` for the document using the configured engine;
        the text is left empty when no valid engine is configured.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return
        if self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)
--- a/src/paperless_remote/signals.py
+++ b/src/paperless_remote/signals.py
@@ -1,16 +1,36 @@
-def get_parser(*args, **kwargs):
+from __future__ import annotations
    from paperless_remote.parsers import RemoteDocumentParser
 from typing import Any
 def get_parser(*args: Any, **kwargs: Any) -> Any:
    from paperless.parsers.remote import RemoteDocumentParser
    # The new RemoteDocumentParser does not accept the progress_callback
    # kwarg injected by the old signal-based consumer.  logging_group is
    # forwarded as a positional arg.
    # Phase 4 will replace this signal path with the new ParserRegistry.
    kwargs.pop("progress_callback", None)
    return RemoteDocumentParser(*args, **kwargs)
-def get_supported_mime_types():
+def get_supported_mime_types() -> dict[str, str]:
-    from paperless_remote.parsers import RemoteDocumentParser
+    from django.conf import settings
-    return RemoteDocumentParser(None).supported_mime_types()
+    from paperless.parsers.remote import RemoteDocumentParser
    from paperless.parsers.remote import RemoteEngineConfig
    config = RemoteEngineConfig(
        engine=settings.REMOTE_OCR_ENGINE,
        api_key=settings.REMOTE_OCR_API_KEY,
        endpoint=settings.REMOTE_OCR_ENDPOINT,
    )
    if not config.engine_is_valid():
        return {}
    return RemoteDocumentParser.supported_mime_types()
-def remote_consumer_declaration(sender, **kwargs):
+def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
    return {
        "parser": get_parser,
        "weight": 5,
--- a/src/paperless_remote/tests/test_parser.py
+++ b/src/paperless_remote/tests/test_parser.py
@@ -1,131 +0,0 @@
 import uuid
 from pathlib import Path
 from unittest import mock
 from django.test import TestCase
 from django.test import override_settings
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin
 from paperless_remote.parsers import RemoteDocumentParser
 from paperless_remote.signals import get_parser
 class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for the remote parser obtained through the signals ``get_parser`` shim.

    The Azure ``DocumentIntelligenceClient`` is always mocked, so no test in
    this class talks to a real remote service.
    """
    # Directory holding the sample documents, located next to this test module.
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
    def assertContainsStrings(self, content: str, strings: list[str]) -> None:
        """Fail unless every entry of ``strings`` occurs in ``content``, in order."""
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s in content:
                # index() reports the first occurrence of s in content.
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        # The positions must already be sorted for the order to match.
        self.assertListEqual(indices, sorted(indices))
    # mock.patch decorators are applied bottom-up: the Azure client patch is
    # the first mock argument, the run_subprocess patch is the second.
    @mock.patch("paperless_tesseract.parsers.run_subprocess")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
        """Parsing with a mocked Azure client yields the mocked text content."""
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client
        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_client.begin_analyze_document.return_value = mock_poller
        mock_poller.result.return_value.content = "This is a test document."
        # Return dummy PDF bytes
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]
        # Simulate pdftotext by writing dummy text to sidecar file
        def fake_run(cmd, *args, **kwargs) -> None:
            # The last element of the command is used as the output file path.
            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
                f.write("This is a test document.")
        mock_subprocess.side_effect = fake_run
        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )
            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure_error_logged_and_returns_none(
        self,
        mock_client_cls,
    ) -> None:
        """A failing Azure call leaves ``text`` as None and logs exactly one error."""
        mock_client = mock.Mock()
        # Make the analysis request itself raise.
        mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
        mock_client_cls.return_value = mock_client
        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            # Capture calls to the parser's error logger for the assertions below.
            with mock.patch.object(parser.log, "error") as mock_log_error:
                parser.parse(
                    self.SAMPLE_FILES / "simple-digital.pdf",
                    "application/pdf",
                )
        self.assertIsNone(parser.text)
        mock_client.begin_analyze_document.assert_called_once()
        # The client is expected to be closed even though the request failed.
        mock_client.close.assert_called_once()
        mock_log_error.assert_called_once()
        self.assertIn(
            "Azure AI Vision parsing failed",
            mock_log_error.call_args[0][0],
        )
    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="key",
        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
    )
    def test_supported_mime_types_valid_config(self) -> None:
        """With engine, key and endpoint set, the seven listed MIME types are reported."""
        parser = RemoteDocumentParser(uuid.uuid4())
        expected_types = {
            "application/pdf": ".pdf",
            "image/png": ".png",
            "image/jpeg": ".jpg",
            "image/tiff": ".tiff",
            "image/bmp": ".bmp",
            "image/gif": ".gif",
            "image/webp": ".webp",
        }
        self.assertEqual(parser.supported_mime_types(), expected_types)
    def test_supported_mime_types_invalid_config(self) -> None:
        """Without any settings override, no MIME types are reported."""
        # NOTE(review): relies on the project's default settings leaving the
        # remote engine unconfigured - confirm against the test settings.
        parser = get_parser(uuid.uuid4())
        self.assertEqual(parser.supported_mime_types(), {})
    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_parse_with_invalid_config(self) -> None:
        """With all remote settings set to None, parsing yields empty text."""
        parser = get_parser(uuid.uuid4())
        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
        self.assertEqual(parser.text, "")
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,18 +1,20 @@
-def get_parser(*args, **kwargs):
+from __future__ import annotations
 from typing import Any
 def get_parser(*args: Any, **kwargs: Any) -> Any:
    from paperless.parsers.text import TextDocumentParser
-    # TextDocumentParser accepts logging_group for constructor compatibility but
+    # The new TextDocumentParser does not accept the progress_callback
-    # does not store or use it (no legacy DocumentParser base class).
+    # kwarg injected by the old signal-based consumer.  logging_group is
-    # progress_callback is also not used.  Both may arrive as a positional arg
+    # forwarded as a positional arg.
-    # (consumer) or a keyword arg (views); *args absorbs the positional form,
+    # Phase 4 will replace this signal path with the new ParserRegistry.
    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
    # path with the new ParserRegistry so the shim can be removed at that point.
    kwargs.pop("logging_group", None)
    kwargs.pop("progress_callback", None)
-    return TextDocumentParser()
+    return TextDocumentParser(*args, **kwargs)
-def text_consumer_declaration(sender, **kwargs):
+def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
    return {
        "parser": get_parser,
        "weight": 10,
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -0,0 +1,136 @@
 from pathlib import Path
 import httpx
 from django.conf import settings
 from django.utils import timezone
 from gotenberg_client import GotenbergClient
 from gotenberg_client.options import PdfAFormat
 from tika_client import TikaClient
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.config import OutputTypeConfig
 from paperless.models import OutputTypeChoices
 class TikaDocumentParser(DocumentParser):
    """
    Parser that extracts text and metadata through a Tika server and
    renders a PDF version of the document through Gotenberg.

    The Tika endpoint is read from ``settings.TIKA_ENDPOINT`` and the
    Gotenberg endpoint from ``settings.TIKA_GOTENBERG_ENDPOINT``; both
    clients use ``settings.CELERY_TASK_TIME_LIMIT`` as their timeout.
    """
    logging_name = "paperless.parsing.tika"
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Create a thumbnail from the PDF rendition of the document.

        If ``self.archive_path`` is not set yet, the document is converted
        to PDF first and the result is stored in ``self.archive_path``.
        ``mime_type`` is not used by this method.

        Returns:
            Whatever ``make_thumbnail_from_pdf`` returns for the PDF.
        """
        if not self.archive_path:
            self.archive_path = self.convert_to_pdf(document_path, file_name)
        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )
    def extract_metadata(self, document_path, mime_type):
        """Fetch the document metadata from the Tika server.

        Returns:
            A list with one dict per metadata key, each holding empty
            ``namespace`` and ``prefix`` strings plus the ``key`` and its
            ``value``. An empty list is returned if anything goes wrong.
        """
        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                parsed = client.metadata.from_file(document_path, mime_type)
                return [
                    {
                        "namespace": "",
                        "prefix": "",
                        "key": key,
                        "value": parsed.data[key],
                    }
                    for key in parsed.data
                ]
        except Exception as e:
            # Best effort: a metadata failure is logged and otherwise ignored.
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []
    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
        """Extract text and creation date via Tika, then build the PDF rendition.

        On success ``self.text``, ``self.date`` and ``self.archive_path``
        are populated.

        Raises:
            ParseError: if the Tika request fails. Failures raised by
                ``convert_to_pdf`` propagate from there.
        """
        self.log.info(f"Sending {document_path} to Tika server")
        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                try:
                    parsed = client.tika.as_text.from_file(document_path, mime_type)
                except httpx.HTTPStatusError as err:
                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                    # Tika fails with some files as multi-part form data
                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                        # Retry once, sending the raw bytes instead of the file.
                        parsed = client.tika.as_text.from_buffer(
                            document_path.read_bytes(),
                            mime_type,
                        )
                    else:  # pragma: no cover
                        raise
        except Exception as err:
            # Any failure, including one from the retry, becomes a ParseError.
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err
        self.text = parsed.content
        if self.text is not None:
            self.text = self.text.strip()
        self.date = parsed.created
        # Naive datetimes are converted to timezone-aware ones.
        if self.date is not None and timezone.is_naive(self.date):
            self.date = timezone.make_aware(self.date)
        self.archive_path = self.convert_to_pdf(document_path, file_name)
    def convert_to_pdf(self, document_path: Path, file_name):
        """Convert the document to PDF with Gotenberg's LibreOffice route.

        The PDF is written to ``convert.pdf`` inside ``self.tempdir``.
        ``file_name`` is accepted but not used in this method.

        Returns:
            The path of the written PDF file.

        Raises:
            ParseError: if running the conversion or writing the result fails.
        """
        pdf_path = Path(self.tempdir) / "convert.pdf"
        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
        with (
            GotenbergClient(
                host=settings.TIKA_GOTENBERG_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client,
            client.libre_office.to_pdf() as route,
        ):
            # Set the output format of the resulting PDF
            # (no PDF/A format is requested when none of the branches match).
            if settings.OCR_OUTPUT_TYPE in {
                OutputTypeChoices.PDF_A,
                OutputTypeChoices.PDF_A2,
            }:
                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
                self.log.warning(
                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                )
                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
                route.pdf_format(PdfAFormat.A3b)
            route.convert(document_path)
            try:
                response = route.run()
                pdf_path.write_bytes(response.content)
                return pdf_path
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err
    def get_settings(self) -> OutputTypeConfig:
        """
        This parser only uses the PDF output type configuration currently
        """
        return OutputTypeConfig()
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -1,15 +1,7 @@
 def get_parser(*args, **kwargs):
-    from paperless.parsers.tika import TikaDocumentParser
+    from paperless_tika.parsers import TikaDocumentParser
-    # TikaDocumentParser accepts logging_group for constructor compatibility but
+    return TikaDocumentParser(*args, **kwargs)
    # does not store or use it (no legacy DocumentParser base class).
    # progress_callback is also not used.  Both may arrive as a positional arg
    # (consumer) or a keyword arg (views); *args absorbs the positional form,
    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
    # path with the new ParserRegistry so the shim can be removed at that point.
    kwargs.pop("logging_group", None)
    kwargs.pop("progress_callback", None)
    return TikaDocumentParser()
 def tika_consumer_declaration(sender, **kwargs):
--- a/src/paperless_tika/tests/__init__.py
+++ b/src/paperless_tika/tests/__init__.py
--- a/src/paperless_tika/tests/conftest.py
+++ b/src/paperless_tika/tests/conftest.py
@@ -0,0 +1,41 @@
 from collections.abc import Generator
 from pathlib import Path
 import pytest
 from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
 def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    try:
        parser = TikaDocumentParser(logging_group=None)
        yield parser
    finally:
        # TODO(stumpylog): Cleanup once all parsers are handled
        parser.cleanup()
@pytest.fixture(scope="session")
 def sample_dir() -> Path:
    """Return the resolved path of the ``samples`` directory next to this file."""
    return (Path(__file__).parent / Path("samples")).resolve()
@pytest.fixture(scope="session")
 def sample_odt_file(sample_dir: Path) -> Path:
    """Return the path of ``sample.odt`` inside the sample directory."""
    return sample_dir / "sample.odt"
@pytest.fixture(scope="session")
 def sample_docx_file(sample_dir: Path) -> Path:
    """Return the path of ``sample.docx`` inside the sample directory."""
    return sample_dir / "sample.docx"
@pytest.fixture(scope="session")
 def sample_doc_file(sample_dir: Path) -> Path:
    """Return the path of ``sample.doc`` inside the sample directory."""
    return sample_dir / "sample.doc"
@pytest.fixture(scope="session")
 def sample_broken_odt(sample_dir: Path) -> Path:
    """Return the path of ``multi-part-broken.odt`` inside the sample directory."""
    return sample_dir / "multi-part-broken.odt"
--- a/src/paperless_tika/tests/samples/multi-part-broken.odt
+++ b/src/paperless_tika/tests/samples/multi-part-broken.odt
--- a/src/paperless_tika/tests/samples/sample.doc
+++ b/src/paperless_tika/tests/samples/sample.doc
--- a/src/paperless_tika/tests/samples/sample.docx
+++ b/src/paperless_tika/tests/samples/sample.docx
--- a/src/paperless_tika/tests/samples/sample.odt
+++ b/src/paperless_tika/tests/samples/sample.odt
--- a/src/paperless/tests/parsers/test_tika_liva.py
+++ b/src/paperless/tests/parsers/test_tika_liva.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import pytest
 from documents.tests.utils import util_call_with_backoff
-from paperless.parsers.tika import TikaDocumentParser
+from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif(
@@ -42,15 +42,14 @@ class TestTikaParserAgainstServer:
        )
        assert (
-            tika_parser.get_text()
+            tika_parser.text
            == "This is an ODT test document, created September 14, 2022"
        )
-        archive = tika_parser.get_archive_path()
+        assert tika_parser.archive_path is not None
-        assert archive is not None
+        assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
        assert b"PDF-" in archive.read_bytes()[:10]
        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
-        # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
+        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
    def test_basic_parse_docx(
        self,
@@ -75,15 +74,14 @@ class TestTikaParserAgainstServer:
        )
        assert (
-            tika_parser.get_text()
+            tika_parser.text
            == "This is an DOCX test document, also made September 14, 2022"
        )
-        archive = tika_parser.get_archive_path()
+        assert tika_parser.archive_path is not None
-        assert archive is not None
+        with Path(tika_parser.archive_path).open("rb") as f:
        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]
-        # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
+        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
    def test_basic_parse_doc(
        self,
@@ -104,12 +102,13 @@ class TestTikaParserAgainstServer:
            [sample_doc_file, "application/msword"],
        )
-        text = tika_parser.get_text()
+        assert tika_parser.text is not None
-        assert text is not None
+        assert (
-        assert "This is a test document, saved in the older .doc format" in text
+            "This is a test document, saved in the older .doc format"
-        archive = tika_parser.get_archive_path()
+            in tika_parser.text
-        assert archive is not None
+        )
-        with archive.open("rb") as f:
+        assert tika_parser.archive_path is not None
        with Path(tika_parser.archive_path).open("rb") as f:
            assert b"PDF-" in f.read()[:10]
    def test_tika_fails_multi_part(
@@ -134,7 +133,6 @@ class TestTikaParserAgainstServer:
            [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
        )
-        archive = tika_parser.get_archive_path()
+        assert tika_parser.archive_path is not None
-        assert archive is not None
+        with Path(tika_parser.archive_path).open("rb") as f:
        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -9,56 +9,7 @@ from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock
 from documents.parsers import ParseError
-from paperless.parsers import ParserProtocol
+from paperless_tika.parsers import TikaDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 class TestTikaParserRegistryInterface:
    """Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
    def test_satisfies_parser_protocol(self) -> None:
        """A default-constructed parser passes the isinstance check against ParserProtocol."""
        assert isinstance(TikaDocumentParser(), ParserProtocol)
    def test_supported_mime_types_is_classmethod(self) -> None:
        """supported_mime_types() is callable on the class and returns a non-empty dict."""
        mime_types = TikaDocumentParser.supported_mime_types()
        assert isinstance(mime_types, dict)
        assert len(mime_types) > 0
    def test_score_returns_none_when_tika_disabled(
        self,
        settings: SettingsWrapper,
    ) -> None:
        """score() returns None for an ODT file when TIKA_ENABLED is False."""
        settings.TIKA_ENABLED = False
        result = TikaDocumentParser.score(
            "application/vnd.oasis.opendocument.text",
            "sample.odt",
        )
        assert result is None
    def test_score_returns_int_when_tika_enabled(
        self,
        settings: SettingsWrapper,
    ) -> None:
        """score() returns an int for an ODT file when TIKA_ENABLED is True."""
        settings.TIKA_ENABLED = True
        result = TikaDocumentParser.score(
            "application/vnd.oasis.opendocument.text",
            "sample.odt",
        )
        assert isinstance(result, int)
    def test_score_returns_none_for_unsupported_mime(
        self,
        settings: SettingsWrapper,
    ) -> None:
        """score() returns None for application/pdf even when TIKA_ENABLED is True."""
        settings.TIKA_ENABLED = True
        result = TikaDocumentParser.score("application/pdf", "doc.pdf")
        assert result is None
    def test_can_produce_archive_is_false(self) -> None:
        """can_produce_archive is False on a default-constructed parser."""
        assert TikaDocumentParser().can_produce_archive is False
    def test_requires_pdf_rendition_is_true(self) -> None:
        """requires_pdf_rendition is True on a default-constructed parser."""
        assert TikaDocumentParser().requires_pdf_rendition is True
@pytest.mark.django_db()
@@ -85,12 +36,12 @@ class TestTikaParser:
        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
-        assert tika_parser.get_text() == "the content"
+        assert tika_parser.text == "the content"
-        assert tika_parser.get_archive_path() is not None
+        assert tika_parser.archive_path is not None
-        with Path(tika_parser.get_archive_path()).open("rb") as f:
+        with Path(tika_parser.archive_path).open("rb") as f:
            assert f.read() == b"PDF document"
-        assert tika_parser.get_date() == datetime.datetime(
+        assert tika_parser.date == datetime.datetime(
            2020,
            11,
            21,
@@ -138,7 +89,7 @@ class TestTikaParser:
        httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
        with pytest.raises(ParseError):
-            tika_parser._convert_to_pdf(sample_odt_file)
+            tika_parser.convert_to_pdf(sample_odt_file, None)
    @pytest.mark.parametrize(
        ("setting_value", "expected_form_value"),
@@ -155,6 +106,7 @@ class TestTikaParser:
        expected_form_value: str,
        httpx_mock: HTTPXMock,
        settings: SettingsWrapper,
        tika_parser: TikaDocumentParser,
        sample_odt_file: Path,
    ) -> None:
        """
@@ -165,8 +117,6 @@ class TestTikaParser:
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
        # Parser must be created after the setting is changed so that
        # OutputTypeConfig reads the correct value at __init__ time.
        settings.OCR_OUTPUT_TYPE = setting_value
        httpx_mock.add_response(
            status_code=codes.OK,
@@ -174,8 +124,7 @@ class TestTikaParser:
            method="POST",
        )
-        with TikaDocumentParser() as parser:
+        tika_parser.convert_to_pdf(sample_odt_file, None)
            parser._convert_to_pdf(sample_odt_file)
        request = httpx_mock.get_request()
Author	SHA1	Message	Date
Trenton H	2098a11eb1	Fix: text parser get_parser forwards logging_group, drops progress_callback TextDocumentParser.__init__ accepts logging_group: object = None, same as RemoteDocumentParser. The old shim incorrectly dropped it; fix to forward it as a positional arg and only drop progress_callback. Add type annotations and from __future__ import annotations for consistency with the remote parser signals shim. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 12:36:24 -07:00
Trenton H	af8a8e791b	Fix: get_parser factory forwards logging_group, drops progress_callback consumer.py calls parser_class(logging_group, progress_callback=...). RemoteDocumentParser.__init__ accepts logging_group but not progress_callback, so only the latter is dropped — matching the pattern established by the TextDocumentParser signals shim. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 12:35:16 -07:00
Trenton H	8d4163bef3	Refactor: fix type errors in remote parser and signals - remote.py: add `if TYPE_CHECKING: assert` guards before the Azure client construction to narrow config.endpoint and config.api_key from str\|None to str. The narrowing is safe: engine_is_valid() guarantees both are non-None when it returns True (api_key explicitly; endpoint via `not (engine=="azureai" and endpoint is None)` for the only valid engine). Asserts are wrapped in TYPE_CHECKING so they carry zero runtime cost. - signals.py: add full type annotations — return types, Any-typed sender parameter, and explicit logging_group argument replacing *args. Add `from __future__ import annotations` for consistent annotation style. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 12:31:17 -07:00
Trenton H	e9e1d4ccca	Refactor: wire RemoteDocumentParser into consumer and fix signals - paperless_remote/signals.py: import from paperless.parsers.remote (new location after git mv). supported_mime_types() is now a classmethod that always returns the full set, so get_supported_mime_types() in the signal layer explicitly checks RemoteEngineConfig validity and returns {} when unconfigured — preserving the old behaviour where an unconfigured remote parser does not register for any MIME types. - documents/consumer.py: extend the _parser_cleanup() shim, parse() dispatch, and get_thumbnail() dispatch to include RemoteDocumentParser alongside TextDocumentParser. Both new-style parsers use __exit__ for cleanup and take (document_path, mime_type) without a file_name argument. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 12:09:33 -07:00
Trenton H	c955ba7d07	Refactor: improve remote parser test fixture structure - make_azure_mock moved from conftest.py back into test_remote_parser.py; it is specific to that module and does not belong in shared fixtures - azure_client fixture composes azure_settings + make_azure_mock + patch in one step; tests no longer repeat the mocker.patch call or carry an unused azure_settings parameter - failing_azure_client fixture similarly composes azure_settings + patch with a RuntimeError side effect; TestRemoteParserParseError now only receives the mock it actually uses - All @pytest.mark.parametrize calls use pytest.param with explicit ids (pdf, png, jpeg, ...) for readable test output Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 12:00:37 -07:00
Trenton H	7028bb2163	Refactor: use fixture factory and usefixtures in remote parser tests - `_make_azure_mock` helper promoted to `make_azure_mock` factory fixture in conftest.py; tests call `make_azure_mock()` or `make_azure_mock("custom text")` instead of a module-level function - `azure_settings` and `no_engine_settings` applied via `@pytest.mark.usefixtures` wherever their value is not referenced inside the test body; `TestRemoteParserParseError` marked at the class level since all three tests need the same setting Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 11:56:38 -07:00
Trenton H	5d4d87764c	Feature: migrate RemoteDocumentParser to ParserProtocol interface Rewrites the remote OCR parser to the new plugin system contract: - `supported_mime_types()` is now a classmethod that always returns the full set of 7 MIME types; the old instance-method hack (returning {} when unconfigured) is removed - `score()` classmethod returns None when no remote engine is configured (making the parser invisible to the registry), and 20 when active — higher than the tesseract default of 10 so the remote engine takes priority when both are available - No longer inherits from RasterisedDocumentParser; inherits no parser class at all — just implements the protocol directly - `can_produce_archive = True`; `requires_pdf_rendition = False` - `_azure_ai_vision_parse()` takes explicit config arg; API client created and closed within the method - `get_page_count()` returns the PDF page count for application/pdf, delegating to the new `get_page_count_for_pdf()` utility - `extract_metadata()` delegates to `extract_pdf_metadata()` for PDFs; returns [] for all other MIME types New files: - `src/paperless/parsers/utils.py` — shared `extract_pdf_metadata()` and `get_page_count_for_pdf()` utilities (pikepdf-based); both the remote and tesseract parsers will use these going forward - `src/paperless/tests/parsers/test_remote_parser.py` — 42 pytest-style tests using pytest-django `settings` and pytest-mock `mocker` fixtures - `src/paperless/tests/parsers/conftest.py` — remote parser instance, sample-file, and settings-helper fixtures Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 11:52:11 -07:00
Trenton H	75dce7f19f	Refactor: move remote parser, test, and sample to paperless.parsers Relocates three files to their new homes in the parser plugin system: - src/paperless_remote/parsers.py → src/paperless/parsers/remote.py - src/paperless_remote/tests/test_parser.py → src/paperless/tests/parsers/test_remote_parser.py - src/paperless_remote/tests/samples/simple-digital.pdf → src/paperless/tests/samples/remote/simple-digital.pdf Content and imports will be updated in the follow-up commit that rewrites the parser to the new ParserProtocol interface. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 11:32:34 -07:00