From 2b3361726209449e3b044ea01f971bec69049dce Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 12 Mar 2026 15:30:59 -0700
Subject: [PATCH] =?UTF-8?q?Feature:=20Phase=203=20=E2=80=94=20migrate=20Ti?=
 =?UTF-8?q?kaDocumentParser=20to=20ParserProtocol?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor TikaDocumentParser to satisfy ParserProtocol without subclassing
the legacy DocumentParser ABC:

- Add ClassVars: name, version, author, url
- Add supported_mime_types() classmethod (12 Office/ODF/RTF MIME types)
- Add score() classmethod — returns 10 when TIKA_ENABLED is True and the
  MIME type is supported, None otherwise
- can_produce_archive = False (PDF is for display, not an OCR archive)
- requires_pdf_rendition = True (Office formats need PDF for browser display)
- __enter__/__exit__ via ExitStack: TikaClient opened once per parser
  lifetime and shared across parse() and extract_metadata() calls
- extract_metadata() falls back to a short-lived TikaClient when called
  outside a context manager (legacy view-layer metadata path); parse() has
  no such fallback and must be called after __enter__
- _convert_to_pdf() uses OutputTypeConfig() to honour the database-stored
  ApplicationConfiguration before falling back to the env-var setting
- Rename convert_to_pdf → _convert_to_pdf (private helper)

Update paperless_tika/signals.py shim to import from the new module path
and drop the legacy logging_group/progress_callback kwargs.

Update documents/consumer.py to extend the existing TextDocumentParser
special cases to also cover TikaDocumentParser (parse/get_thumbnail
signatures, __exit__ cleanup).

Add TestTikaParserRegistryInterface (7 tests) covering score(), properties,
and ParserProtocol isinstance check.  Update existing tests to use the new
accessor API (get_text, get_date, get_archive_path, _convert_to_pdf).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/documents/consumer.py                     |   7 +-
 src/paperless/parsers/tika.py                 | 453 +++++++++++++++---
 .../tests/parsers/test_tika_parser.py         |  67 ++-
 src/paperless_tika/signals.py                 |  10 +-
 4 files changed, 451 insertions(+), 86 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index cbc2198ef..fadd9a4e6 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -52,6 +52,7 @@ from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
 from paperless_mail.parsers import MailDocumentParser
 
 LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
 
     TODO(stumpylog): Remove me in the future
     """
-    if isinstance(parser, TextDocumentParser):
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
         parser.__exit__(None, None, None)
     else:
         parser.cleanup()
@@ -476,7 +477,7 @@ class ConsumerPlugin(
                     self.filename,
                     self.input_doc.mailrule_id,
                 )
-            elif isinstance(document_parser, TextDocumentParser):
+            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                 # TODO(stumpylog): Remove me in the future
                 document_parser.parse(self.working_copy, mime_type)
             else:
@@ -489,7 +490,7 @@ class ConsumerPlugin(
                 ProgressStatusOptions.WORKING,
                 ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
             )
-            if isinstance(document_parser, TextDocumentParser):
+            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                 # TODO(stumpylog): Remove me in the future
                 thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
             else:
diff --git a/src/paperless/parsers/tika.py b/src/paperless/parsers/tika.py
index 22a5bc1c6..467173e85 100644
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -1,4 +1,21 @@
+"""
+Built-in Tika document parser.
+
+Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
+sending them to an Apache Tika server for text extraction and a Gotenberg
+server for PDF conversion.  Because the source formats cannot be rendered by
+a browser natively, the parser always produces a PDF rendition for display.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
+from contextlib import ExitStack
 from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
 
 import httpx
 from django.conf import settings
@@ -7,92 +24,388 @@ from gotenberg_client import GotenbergClient
 from gotenberg_client.options import PdfAFormat
 from tika_client import TikaClient
 
-from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.config import OutputTypeConfig
 from paperless.models import OutputTypeChoices
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsing.tika")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "application/msword": ".doc",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+    "application/vnd.ms-excel": ".xls",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+    "application/vnd.ms-powerpoint": ".ppt",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+    "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
+    "application/vnd.oasis.opendocument.presentation": ".odp",
+    "application/vnd.oasis.opendocument.spreadsheet": ".ods",
+    "application/vnd.oasis.opendocument.text": ".odt",
+    "application/vnd.oasis.opendocument.graphics": ".odg",
+    "text/rtf": ".rtf",
+}
 
 
-class TikaDocumentParser(DocumentParser):
-    """
-    This parser sends documents to a local tika server
+class TikaDocumentParser:
+    """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
+
+    Text extraction is handled by the Tika server.  PDF conversion for display
+    is handled by Gotenberg (LibreOffice route).  Because the source formats
+    cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
+    True and the PDF is always produced regardless of the ``produce_archive``
+    flag passed to ``parse``.
+
+    The underlying ``TikaClient`` HTTP connection is opened once in
+    ``__enter__`` via an ``ExitStack`` and shared across ``parse`` and
+    ``extract_metadata`` calls, then closed in ``__exit__``.  When the parser
+    is used without a context manager (e.g. the legacy view-layer metadata
+    path), ``extract_metadata`` falls back to creating a short-lived client
+    for that call only.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
     """
 
-    logging_name = "paperless.parsing.tika"
+    name: str = "Paperless-ngx Tika Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"
 
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        if not self.archive_path:
-            self.archive_path = self.convert_to_pdf(document_path, file_name)
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
 
-        return make_thumbnail_from_pdf(
-            self.archive_path,
-            self.tempdir,
-            self.logging_group,
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file.
+
+        Returns ``None`` when Tika integration is disabled so the registry
+        skips this parser entirely.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
+        """
+        if not settings.TIKA_ENABLED:
+            return None
+        if mime_type in _SUPPORTED_MIME_TYPES:
+            return 10
+        return None
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — Tika produces a display PDF, not an OCR archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always True — Office formats cannot be rendered natively in a
+            browser, so a PDF conversion is always required for display.
+        """
+        return True
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
         )
+        self._text: str | None = None
+        self._date: datetime.datetime | None = None
+        self._archive_path: Path | None = None
+        self._exit_stack = ExitStack()
+        self._tika_client: TikaClient | None = None
 
-    def extract_metadata(self, document_path, mime_type):
-        try:
-            with TikaClient(
+    def __enter__(self) -> Self:
+        self._tika_client = self._exit_stack.enter_context(
+            TikaClient(
                 tika_url=settings.TIKA_ENDPOINT,
                 timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client:
-                parsed = client.metadata.from_file(document_path, mime_type)
-                return [
-                    {
-                        "namespace": "",
-                        "prefix": "",
-                        "key": key,
-                        "value": parsed.data[key],
-                    }
-                    for key in parsed.data
-                ]
-        except Exception as e:
-            self.log.warning(
-                f"Error while fetching document metadata for {document_path}: {e}",
-            )
-            return []
+            ),
+        )
+        return self
 
-    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
-        self.log.info(f"Sending {document_path} to Tika server")
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        self._exit_stack.__exit__(exc_type, exc_val, exc_tb)
+        logger.debug("Cleaning up temporary directory %s", self._tempdir)
+        shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Send the document to Tika for text extraction and Gotenberg for PDF.
+
+        Because ``requires_pdf_rendition`` is True the PDF conversion is
+        always performed — the ``produce_archive`` flag is intentionally
+        ignored.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the document file to parse.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            Accepted for protocol compatibility but ignored; the PDF rendition
+            is always produced since the source format cannot be displayed
+            natively in the browser.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Tika or Gotenberg returns an error.
+        """
+        logger.info("Sending %s to Tika server", document_path)
 
         try:
-            with TikaClient(
-                tika_url=settings.TIKA_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client:
-                try:
-                    parsed = client.tika.as_text.from_file(document_path, mime_type)
-                except httpx.HTTPStatusError as err:
-                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
-                    # Tika fails with some files as multi-part form data
-                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
-                        parsed = client.tika.as_text.from_buffer(
-                            document_path.read_bytes(),
-                            mime_type,
-                        )
-                    else:  # pragma: no cover
-                        raise
+            try:
+                parsed = self._tika_client.tika.as_text.from_file(
+                    document_path,
+                    mime_type,
+                )
+            except httpx.HTTPStatusError as err:
+                # Workaround https://issues.apache.org/jira/browse/TIKA-4110
+                # Tika fails with some files as multi-part form data
+                if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
+                    parsed = self._tika_client.tika.as_text.from_buffer(
+                        document_path.read_bytes(),
+                        mime_type,
+                    )
+                else:  # pragma: no cover
+                    raise
         except Exception as err:
             raise ParseError(
                 f"Could not parse {document_path} with tika server at "
                 f"{settings.TIKA_ENDPOINT}: {err}",
             ) from err
 
-        self.text = parsed.content
-        if self.text is not None:
-            self.text = self.text.strip()
+        self._text = parsed.content
+        if self._text is not None:
+            self._text = self._text.strip()
 
-        self.date = parsed.created
-        if self.date is not None and timezone.is_naive(self.date):
-            self.date = timezone.make_aware(self.date)
+        self._date = parsed.created
+        if self._date is not None and timezone.is_naive(self._date):
+            self._date = timezone.make_aware(self._date)
 
-        self.archive_path = self.convert_to_pdf(document_path, file_name)
+        # Always convert — requires_pdf_rendition=True means the browser
+        # cannot display the source format natively.
+        self._archive_path = self._convert_to_pdf(document_path)
 
-    def convert_to_pdf(self, document_path: Path, file_name):
-        pdf_path = Path(self.tempdir) / "convert.pdf"
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
 
-        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not run or Tika returned none.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Creation date from Tika metadata, or None if not detected.
+        """
+        return self._date
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated PDF rendition, or None.
+
+        Returns
+        -------
+        Path | None
+            Path to the PDF produced by Gotenberg, or None if parse has not
+            been called yet.
+        """
+        return self._archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Generate a thumbnail from the PDF rendition of the document.
+
+        Converts the document to PDF first if not already done.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        Path
+            Path to the generated WebP thumbnail inside the temporary directory.
+        """
+        if self._archive_path is None:
+            self._archive_path = self._convert_to_pdf(document_path)
+        return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.
+
+        Returns
+        -------
+        int | None
+            Always None — page count is not available from Tika.
+        """
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata via the Tika metadata endpoint.
+
+        When the parser is used as a context manager, the shared
+        ``TikaClient`` opened in ``__enter__`` is reused.  When called
+        outside a context manager (e.g. the legacy view-layer metadata path),
+        a short-lived ``TikaClient`` is created for this call only.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            All key/value pairs returned by Tika, or ``[]`` on error.
+        """
+        try:
+            if self._tika_client is not None:
+                parsed = self._tika_client.metadata.from_file(document_path, mime_type)
+            else:
+                with TikaClient(
+                    tika_url=settings.TIKA_ENDPOINT,
+                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+                ) as client:
+                    parsed = client.metadata.from_file(document_path, mime_type)
+            return [
+                {
+                    "namespace": "",
+                    "prefix": "",
+                    "key": key,
+                    "value": parsed.data[key],
+                }
+                for key in parsed.data
+            ]
+        except Exception as e:
+            logger.warning(
+                "Error while fetching document metadata for %s: %s",
+                document_path,
+                e,
+            )
+            return []
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _convert_to_pdf(self, document_path: Path) -> Path:
+        """Convert the document to PDF using Gotenberg's LibreOffice route.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Gotenberg returns an error.
+        """
+        pdf_path = self._tempdir / "convert.pdf"
+
+        logger.info("Converting %s to PDF as %s", document_path, pdf_path)
 
         with (
             GotenbergClient(
@@ -101,36 +414,30 @@ class TikaDocumentParser(DocumentParser):
             ) as client,
             client.libre_office.to_pdf() as route,
         ):
-            # Set the output format of the resulting PDF
-            if settings.OCR_OUTPUT_TYPE in {
+            # Set the output format of the resulting PDF.
+            # OutputTypeConfig reads the database-stored ApplicationConfiguration
+            # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
+            output_type = OutputTypeConfig().output_type
+            if output_type in {
                 OutputTypeChoices.PDF_A,
                 OutputTypeChoices.PDF_A2,
             }:
                 route.pdf_format(PdfAFormat.A2b)
-            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
-                self.log.warning(
+            elif output_type == OutputTypeChoices.PDF_A1:
+                logger.warning(
                     "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                 )
                 route.pdf_format(PdfAFormat.A2b)
-            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
+            elif output_type == OutputTypeChoices.PDF_A3:
                 route.pdf_format(PdfAFormat.A3b)
 
             route.convert(document_path)
 
             try:
                 response = route.run()
-
                 pdf_path.write_bytes(response.content)
-
                 return pdf_path
-
             except Exception as err:
                 raise ParseError(
                     f"Error while converting document to PDF: {err}",
                 ) from err
-
-    def get_settings(self) -> OutputTypeConfig:
-        """
-        This parser only uses the PDF output type configuration currently
-        """
-        return OutputTypeConfig()
diff --git a/src/paperless/tests/parsers/test_tika_parser.py b/src/paperless/tests/parsers/test_tika_parser.py
index 2cf39da59..d18d178e6 100644
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -5,11 +5,60 @@ from pathlib import Path
 
 import pytest
 from httpx import codes
-from paperless_tika.parsers import TikaDocumentParser
 from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock
 
 from documents.parsers import ParseError
+from paperless.parsers import ParserProtocol
+from paperless.parsers.tika import TikaDocumentParser
+
+
+class TestTikaParserRegistryInterface:
+    """Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
+
+    def test_satisfies_parser_protocol(self) -> None:
+        assert isinstance(TikaDocumentParser(), ParserProtocol)
+
+    def test_supported_mime_types_is_classmethod(self) -> None:
+        mime_types = TikaDocumentParser.supported_mime_types()
+        assert isinstance(mime_types, dict)
+        assert len(mime_types) > 0
+
+    def test_score_returns_none_when_tika_disabled(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = False
+        result = TikaDocumentParser.score(
+            "application/vnd.oasis.opendocument.text",
+            "sample.odt",
+        )
+        assert result is None
+
+    def test_score_returns_int_when_tika_enabled(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = True
+        result = TikaDocumentParser.score(
+            "application/vnd.oasis.opendocument.text",
+            "sample.odt",
+        )
+        assert isinstance(result, int)
+
+    def test_score_returns_none_for_unsupported_mime(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = True
+        result = TikaDocumentParser.score("application/pdf", "doc.pdf")
+        assert result is None
+
+    def test_can_produce_archive_is_false(self) -> None:
+        assert TikaDocumentParser().can_produce_archive is False
+
+    def test_requires_pdf_rendition_is_true(self) -> None:
+        assert TikaDocumentParser().requires_pdf_rendition is True
 
 
 @pytest.mark.django_db()
@@ -36,12 +85,12 @@ class TestTikaParser:
 
         tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
 
-        assert tika_parser.text == "the content"
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        assert tika_parser.get_text() == "the content"
+        assert tika_parser.get_archive_path() is not None
+        with Path(tika_parser.get_archive_path()).open("rb") as f:
             assert f.read() == b"PDF document"
 
-        assert tika_parser.date == datetime.datetime(
+        assert tika_parser.get_date() == datetime.datetime(
             2020,
             11,
             21,
@@ -89,7 +138,7 @@ class TestTikaParser:
         httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
 
         with pytest.raises(ParseError):
-            tika_parser.convert_to_pdf(sample_odt_file, None)
+            tika_parser._convert_to_pdf(sample_odt_file)
 
     @pytest.mark.parametrize(
         ("setting_value", "expected_form_value"),
@@ -106,7 +155,6 @@ class TestTikaParser:
         expected_form_value: str,
         httpx_mock: HTTPXMock,
         settings: SettingsWrapper,
-        tika_parser: TikaDocumentParser,
         sample_odt_file: Path,
     ) -> None:
         """
@@ -117,6 +165,8 @@ class TestTikaParser:
         THEN:
             - Request to Gotenberg contains the expected PDF/A format string
         """
+        # The setting must be changed before _convert_to_pdf() runs, because
+        # OutputTypeConfig is instantiated there rather than in __init__.
         settings.OCR_OUTPUT_TYPE = setting_value
         httpx_mock.add_response(
             status_code=codes.OK,
@@ -124,7 +174,8 @@ class TestTikaParser:
             method="POST",
         )
 
-        tika_parser.convert_to_pdf(sample_odt_file, None)
+        with TikaDocumentParser() as parser:
+            parser._convert_to_pdf(sample_odt_file)
 
         request = httpx_mock.get_request()
 
diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py
index 2b7495e35..412b80bb7 100644
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -1,7 +1,13 @@
 def get_parser(*args, **kwargs):
-    from paperless_tika.parsers import TikaDocumentParser
+    from paperless.parsers.tika import TikaDocumentParser
 
-    return TikaDocumentParser(*args, **kwargs)
+    # The new TikaDocumentParser ignores logging_group and does not accept
+    # progress_callback, both injected by the old signal-based consumer.
+    # They are discarded here; Phase 4 will replace this signal path with the
+    # new ParserRegistry so the shim can be removed at that point.
+    kwargs.pop("logging_group", None)
+    kwargs.pop("progress_callback", None)
+    return TikaDocumentParser()
 
 
 def tika_consumer_declaration(sender, **kwargs):