Chore(deps-dev): Bump zensical in the development group

Bumps the development group with 1 update: [zensical](https://github.com/zensical/zensical). Updates `zensical` from 0.0.24 to 0.0.25 - [Release notes](https://github.com/zensical/zensical/releases) - [Commits](https://github.com/zensical/zensical/compare/v0.0.24...v0.0.25) --- updated-dependencies: - dependency-name: zensical dependency-version: 0.0.25 dependency-type: direct:development update-type: version-update:semver-patch dependency-group: development ... Signed-off-by: dependabot[bot] <support@github.com>
Feature: Convert Tika parser to the plugin system (#12333)
2026-03-17 22:45:58 +00:00 · 2026-03-17 22:44:48 +00:00 · 2026-03-17 15:43:28 -07:00
20 changed files with 652 additions and 228 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -157,6 +157,9 @@ updates:
      postgres:
        patterns:
          - "docker.io/library/postgres*"
+      greenmail:
+        patterns:
+          - "docker.io/greenmail*"
  - package-ecosystem: "pre-commit" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
--- a/docker/compose/docker-compose.ci-test.yml
+++ b/docker/compose/docker-compose.ci-test.yml
@@ -18,13 +18,13 @@ services:
      - "--log-level=warn"
      - "--log-format=text"
  tika:
-    image: docker.io/apache/tika:latest
+    image: docker.io/apache/tika:3.2.3.0
    hostname: tika
    container_name: tika
    network_mode: host
    restart: unless-stopped
  greenmail:
-    image: greenmail/standalone:2.1.8
+    image: docker.io/greenmail/standalone:2.1.8
    hostname: greenmail
    container_name: greenmail
    environment:
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -52,6 +52,7 @@ from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
 from paperless_mail.parsers import MailDocumentParser

 LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:

    TODO(stumpylog): Remove me in the future
    """
-    if isinstance(parser, TextDocumentParser):
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
        parser.__exit__(None, None, None)
    else:
        parser.cleanup()
@@ -448,6 +449,12 @@ class ConsumerPlugin(
            progress_callback=progress_callback,
        )

+        # New-style parsers use __enter__/__exit__ for resource management.
+        # _parser_cleanup (below) handles __exit__; call __enter__ here.
+        # TODO(stumpylog): Remove me in the future
+        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+            document_parser.__enter__()
+
        self.log.debug(f"Parser: {type(document_parser).__name__}")

        # Parse the document. This may take some time.
@@ -476,7 +483,7 @@ class ConsumerPlugin(
                    self.filename,
                    self.input_doc.mailrule_id,
                )
-            elif isinstance(document_parser, TextDocumentParser):
+            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                # TODO(stumpylog): Remove me in the future
                document_parser.parse(self.working_copy, mime_type)
            else:
@@ -489,7 +496,7 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            if isinstance(document_parser, TextDocumentParser):
+            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                # TODO(stumpylog): Remove me in the future
                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
            else:
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
 from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
 from paperless_tesseract.parsers import RasterisedDocumentParser
-from paperless_tika.parsers import TikaDocumentParser


 class TestParserDiscovery(TestCase):
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -7,6 +7,7 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
+from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -225,6 +226,7 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
+from paperless.parsers import ParserProtocol
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1084,9 +1086,11 @@ class DocumentViewSet(
        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)
+            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)

            try:
-                return parser.extract_metadata(file, mime_type)
+                with cm:
+                    return parser.extract_metadata(file, mime_type)
            except Exception:  # pragma: no cover
                logger.exception(f"Issue getting metadata for {file}")
                # TODO: cover GPG errors, remove later.
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -194,8 +194,10 @@ class ParserRegistry:
        at runtime regardless of registration order.
        """
        from paperless.parsers.text import TextDocumentParser
+        from paperless.parsers.tika import TikaDocumentParser

        self.register_builtin(TextDocumentParser)
+        self.register_builtin(TikaDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -0,0 +1,440 @@
+"""
+Built-in Tika document parser.
+
+Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
+sending them to an Apache Tika server for text extraction and a Gotenberg
+server for PDF conversion.  Because the source formats cannot be rendered by
+a browser natively, the parser always produces a PDF rendition for display.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
+from contextlib import ExitStack
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
+
+import httpx
+from django.conf import settings
+from django.utils import timezone
+from gotenberg_client import GotenbergClient
+from gotenberg_client.options import PdfAFormat
+from tika_client import TikaClient
+
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless.config import OutputTypeConfig
+from paperless.models import OutputTypeChoices
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsing.tika")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "application/msword": ".doc",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+    "application/vnd.ms-excel": ".xls",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+    "application/vnd.ms-powerpoint": ".ppt",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+    "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
+    "application/vnd.oasis.opendocument.presentation": ".odp",
+    "application/vnd.oasis.opendocument.spreadsheet": ".ods",
+    "application/vnd.oasis.opendocument.text": ".odt",
+    "application/vnd.oasis.opendocument.graphics": ".odg",
+    "text/rtf": ".rtf",
+}
+
+
+class TikaDocumentParser:
+    """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
+
+    Text extraction is handled by the Tika server.  PDF conversion for display
+    is handled by Gotenberg (LibreOffice route).  Because the source formats
+    cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
+    True and the PDF is always produced regardless of the ``produce_archive``
+    flag passed to ``parse``.
+
+    Both ``TikaClient`` and ``GotenbergClient`` are opened once in
+    ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
+    ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
+    ``ExitStack.close()`` in ``__exit__``.  The parser must always be used
+    as a context manager.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
+    """
+
+    name: str = "Paperless-ngx Tika Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file.
+
+        Returns ``None`` when Tika integration is disabled so the registry
+        skips this parser entirely.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
+        """
+        if not settings.TIKA_ENABLED:
+            return None
+        if mime_type in _SUPPORTED_MIME_TYPES:
+            return 10
+        return None
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — Tika produces a display PDF, not an OCR archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always True — Office formats cannot be rendered natively in a
+            browser, so a PDF conversion is always required for display.
+        """
+        return True
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
+        self._text: str | None = None
+        self._date: datetime.datetime | None = None
+        self._archive_path: Path | None = None
+        self._exit_stack = ExitStack()
+        self._tika_client: TikaClient | None = None
+        self._gotenberg_client: GotenbergClient | None = None
+
+    def __enter__(self) -> Self:
+        self._tika_client = self._exit_stack.enter_context(
+            TikaClient(
+                tika_url=settings.TIKA_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ),
+        )
+        self._gotenberg_client = self._exit_stack.enter_context(
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ),
+        )
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        self._exit_stack.close()
+        logger.debug("Cleaning up temporary directory %s", self._tempdir)
+        shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Send the document to Tika for text extraction and Gotenberg for PDF.
+
+        Because ``requires_pdf_rendition`` is True the PDF conversion is
+        always performed — the ``produce_archive`` flag is intentionally
+        ignored.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the document file to parse.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            Accepted for protocol compatibility but ignored; the PDF rendition
+            is always produced since the source format cannot be displayed
+            natively in the browser.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Tika or Gotenberg returns an error.
+        """
+        if TYPE_CHECKING:
+            assert self._tika_client is not None
+
+        logger.info("Sending %s to Tika server", document_path)
+
+        try:
+            try:
+                parsed = self._tika_client.tika.as_text.from_file(
+                    document_path,
+                    mime_type,
+                )
+            except httpx.HTTPStatusError as err:
+                # Workaround https://issues.apache.org/jira/browse/TIKA-4110
+                # Tika fails with some files as multi-part form data
+                if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
+                    parsed = self._tika_client.tika.as_text.from_buffer(
+                        document_path.read_bytes(),
+                        mime_type,
+                    )
+                else:  # pragma: no cover
+                    raise
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {document_path} with tika server at "
+                f"{settings.TIKA_ENDPOINT}: {err}",
+            ) from err
+
+        self._text = parsed.content
+        if self._text is not None:
+            self._text = self._text.strip()
+
+        self._date = parsed.created
+        if self._date is not None and timezone.is_naive(self._date):
+            self._date = timezone.make_aware(self._date)
+
+        # Always convert — requires_pdf_rendition=True means the browser
+        # cannot display the source format natively.
+        self._archive_path = self._convert_to_pdf(document_path)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not been called yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Creation date from Tika metadata, or None if not detected.
+        """
+        return self._date
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated PDF rendition, or None.
+
+        Returns
+        -------
+        Path | None
+            Path to the PDF produced by Gotenberg, or None if parse has not
+            been called yet.
+        """
+        return self._archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Generate a thumbnail from the PDF rendition of the document.
+
+        Converts the document to PDF first if not already done.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        Path
+            Path to the generated WebP thumbnail inside the temporary directory.
+        """
+        if self._archive_path is None:
+            self._archive_path = self._convert_to_pdf(document_path)
+        return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.
+
+        Returns
+        -------
+        int | None
+            Always None — page count is not available from Tika.
+        """
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata via the Tika metadata endpoint.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            All key/value pairs returned by Tika, or ``[]`` on error.
+        """
+        if TYPE_CHECKING:
+            assert self._tika_client is not None
+
+        try:
+            parsed = self._tika_client.metadata.from_file(document_path, mime_type)
+            return [
+                {
+                    "namespace": "",
+                    "prefix": "",
+                    "key": key,
+                    "value": parsed.data[key],
+                }
+                for key in parsed.data
+            ]
+        except Exception as e:
+            logger.warning(
+                "Error while fetching document metadata for %s: %s",
+                document_path,
+                e,
+            )
+            return []
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _convert_to_pdf(self, document_path: Path) -> Path:
+        """Convert the document to PDF using Gotenberg's LibreOffice route.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If Gotenberg returns an error.
+        """
+        if TYPE_CHECKING:
+            assert self._gotenberg_client is not None
+
+        pdf_path = self._tempdir / "convert.pdf"
+
+        logger.info("Converting %s to PDF as %s", document_path, pdf_path)
+
+        with self._gotenberg_client.libre_office.to_pdf() as route:
+            # Set the output format of the resulting PDF.
+            # OutputTypeConfig reads the database-stored ApplicationConfiguration
+            # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
+            output_type = OutputTypeConfig().output_type
+            if output_type in {
+                OutputTypeChoices.PDF_A,
+                OutputTypeChoices.PDF_A2,
+            }:
+                route.pdf_format(PdfAFormat.A2b)
+            elif output_type == OutputTypeChoices.PDF_A1:
+                logger.warning(
+                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+                )
+                route.pdf_format(PdfAFormat.A2b)
+            elif output_type == OutputTypeChoices.PDF_A3:
+                route.pdf_format(PdfAFormat.A3b)
+
+            route.convert(document_path)
+
+            try:
+                response = route.run()
+                pdf_path.write_bytes(response.content)
+                return pdf_path
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting document to PDF: {err}",
+                ) from err
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING
 import pytest

 from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser

 if TYPE_CHECKING:
    from collections.abc import Generator
@@ -74,3 +75,86 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
    """
    with TextDocumentParser() as parser:
        yield parser
+
+
+# ------------------------------------------------------------------
+# Tika parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def tika_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the Tika parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/tika/``
+    """
+    return samples_dir / "tika"
+
+
+@pytest.fixture(scope="session")
+def sample_odt_file(tika_samples_dir: Path) -> Path:
+    """Path to a sample ODT file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/sample.odt``.
+    """
+    return tika_samples_dir / "sample.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_docx_file(tika_samples_dir: Path) -> Path:
+    """Path to a sample DOCX file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/sample.docx``.
+    """
+    return tika_samples_dir / "sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_doc_file(tika_samples_dir: Path) -> Path:
+    """Path to a sample DOC file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/sample.doc``.
+    """
+    return tika_samples_dir / "sample.doc"
+
+
+@pytest.fixture(scope="session")
+def sample_broken_odt(tika_samples_dir: Path) -> Path:
+    """Path to a broken ODT file that triggers the multi-part fallback.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/multi-part-broken.odt``.
+    """
+    return tika_samples_dir / "multi-part-broken.odt"
+
+
+# ------------------------------------------------------------------
+# Tika parser instance
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def tika_parser() -> Generator[TikaDocumentParser, None, None]:
+    """Yield a TikaDocumentParser and clean up its temporary directory afterwards.
+
+    Yields
+    ------
+    TikaDocumentParser
+        A ready-to-use parser instance.
+    """
+    with TikaDocumentParser() as parser:
+        yield parser
--- a/src/paperless/tests/parsers/test_tika_liva.py
+++ b/src/paperless/tests/parsers/test_tika_liva.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import pytest

 from documents.tests.utils import util_call_with_backoff
-from paperless_tika.parsers import TikaDocumentParser
+from paperless.parsers.tika import TikaDocumentParser


@pytest.mark.skipif(
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
        )

        assert (
-            tika_parser.text
+            tika_parser.get_text()
            == "This is an ODT test document, created September 14, 2022"
        )
-        assert tika_parser.archive_path is not None
-        assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        assert b"PDF-" in archive.read_bytes()[:10]

        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
-        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
+        # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))

    def test_basic_parse_docx(
        self,
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
        )

        assert (
-            tika_parser.text
+            tika_parser.get_text()
            == "This is an DOCX test document, also made September 14, 2022"
        )
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]

-        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
+        # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))

    def test_basic_parse_doc(
        self,
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
            [sample_doc_file, "application/msword"],
        )

-        assert tika_parser.text is not None
-        assert (
-            "This is a test document, saved in the older .doc format"
-            in tika_parser.text
-        )
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        text = tika_parser.get_text()
+        assert text is not None
+        assert "This is a test document, saved in the older .doc format" in text
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]

    def test_tika_fails_multi_part(
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
            [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
        )

-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -9,7 +9,56 @@ from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock

 from documents.parsers import ParseError
-from paperless_tika.parsers import TikaDocumentParser
+from paperless.parsers import ParserProtocol
+from paperless.parsers.tika import TikaDocumentParser
+
+
+class TestTikaParserRegistryInterface:
+    """Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
+
+    def test_satisfies_parser_protocol(self) -> None:
+        assert isinstance(TikaDocumentParser(), ParserProtocol)
+
+    def test_supported_mime_types_is_classmethod(self) -> None:
+        mime_types = TikaDocumentParser.supported_mime_types()
+        assert isinstance(mime_types, dict)
+        assert len(mime_types) > 0
+
+    def test_score_returns_none_when_tika_disabled(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = False
+        result = TikaDocumentParser.score(
+            "application/vnd.oasis.opendocument.text",
+            "sample.odt",
+        )
+        assert result is None
+
+    def test_score_returns_int_when_tika_enabled(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = True
+        result = TikaDocumentParser.score(
+            "application/vnd.oasis.opendocument.text",
+            "sample.odt",
+        )
+        assert isinstance(result, int)
+
+    def test_score_returns_none_for_unsupported_mime(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = True
+        result = TikaDocumentParser.score("application/pdf", "doc.pdf")
+        assert result is None
+
+    def test_can_produce_archive_is_false(self) -> None:
+        assert TikaDocumentParser().can_produce_archive is False
+
+    def test_requires_pdf_rendition_is_true(self) -> None:
+        assert TikaDocumentParser().requires_pdf_rendition is True


@pytest.mark.django_db()
@@ -36,12 +85,12 @@ class TestTikaParser:

        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")

-        assert tika_parser.text == "the content"
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        assert tika_parser.get_text() == "the content"
+        assert tika_parser.get_archive_path() is not None
+        with Path(tika_parser.get_archive_path()).open("rb") as f:
            assert f.read() == b"PDF document"

-        assert tika_parser.date == datetime.datetime(
+        assert tika_parser.get_date() == datetime.datetime(
            2020,
            11,
            21,
@@ -89,7 +138,7 @@ class TestTikaParser:
        httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

        with pytest.raises(ParseError):
-            tika_parser.convert_to_pdf(sample_odt_file, None)
+            tika_parser._convert_to_pdf(sample_odt_file)

    @pytest.mark.parametrize(
        ("setting_value", "expected_form_value"),
@@ -106,7 +155,6 @@ class TestTikaParser:
        expected_form_value: str,
        httpx_mock: HTTPXMock,
        settings: SettingsWrapper,
-        tika_parser: TikaDocumentParser,
        sample_odt_file: Path,
    ) -> None:
        """
@@ -117,6 +165,8 @@ class TestTikaParser:
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
+        # Change the setting before running the conversion: _convert_to_pdf
+        # instantiates OutputTypeConfig on each call to choose the PDF/A format.
        settings.OCR_OUTPUT_TYPE = setting_value
        httpx_mock.add_response(
            status_code=codes.OK,
@@ -124,7 +174,8 @@ class TestTikaParser:
            method="POST",
        )

-        tika_parser.convert_to_pdf(sample_odt_file, None)
+        with TikaDocumentParser() as parser:
+            parser._convert_to_pdf(sample_odt_file)

        request = httpx_mock.get_request()

--- a/src/paperless/tests/samples/tika/multi-part-broken.odt
+++ b/src/paperless/tests/samples/tika/multi-part-broken.odt
--- a/src/paperless/tests/samples/tika/sample.doc
+++ b/src/paperless/tests/samples/tika/sample.doc
--- a/src/paperless/tests/samples/tika/sample.docx
+++ b/src/paperless/tests/samples/tika/sample.docx
--- a/src/paperless/tests/samples/tika/sample.odt
+++ b/src/paperless/tests/samples/tika/sample.odt
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,10 +1,12 @@
 def get_parser(*args, **kwargs):
    from paperless.parsers.text import TextDocumentParser

-    # The new TextDocumentParser does not accept the legacy logging_group /
-    # progress_callback kwargs injected by the old signal-based consumer.
-    # These are dropped here; Phase 4 will replace this signal path with the
-    # new ParserRegistry so the shim can be removed at that point.
+    # TextDocumentParser accepts logging_group for constructor compatibility but
+    # does not store or use it (no legacy DocumentParser base class).
+    # progress_callback is also not used.  Both may arrive as a positional arg
+    # (consumer) or a keyword arg (views); *args absorbs the positional form,
+    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
+    # path with the new ParserRegistry so the shim can be removed at that point.
    kwargs.pop("logging_group", None)
    kwargs.pop("progress_callback", None)
    return TextDocumentParser()
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,136 +0,0 @@
-from pathlib import Path
-
-import httpx
-from django.conf import settings
-from django.utils import timezone
-from gotenberg_client import GotenbergClient
-from gotenberg_client.options import PdfAFormat
-from tika_client import TikaClient
-
-from documents.parsers import DocumentParser
-from documents.parsers import ParseError
-from documents.parsers import make_thumbnail_from_pdf
-from paperless.config import OutputTypeConfig
-from paperless.models import OutputTypeChoices
-
-
-class TikaDocumentParser(DocumentParser):
-    """
-    This parser sends documents to a local tika server
-    """
-
-    logging_name = "paperless.parsing.tika"
-
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        if not self.archive_path:
-            self.archive_path = self.convert_to_pdf(document_path, file_name)
-
-        return make_thumbnail_from_pdf(
-            self.archive_path,
-            self.tempdir,
-            self.logging_group,
-        )
-
-    def extract_metadata(self, document_path, mime_type):
-        try:
-            with TikaClient(
-                tika_url=settings.TIKA_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client:
-                parsed = client.metadata.from_file(document_path, mime_type)
-                return [
-                    {
-                        "namespace": "",
-                        "prefix": "",
-                        "key": key,
-                        "value": parsed.data[key],
-                    }
-                    for key in parsed.data
-                ]
-        except Exception as e:
-            self.log.warning(
-                f"Error while fetching document metadata for {document_path}: {e}",
-            )
-            return []
-
-    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
-        self.log.info(f"Sending {document_path} to Tika server")
-
-        try:
-            with TikaClient(
-                tika_url=settings.TIKA_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client:
-                try:
-                    parsed = client.tika.as_text.from_file(document_path, mime_type)
-                except httpx.HTTPStatusError as err:
-                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
-                    # Tika fails with some files as multi-part form data
-                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
-                        parsed = client.tika.as_text.from_buffer(
-                            document_path.read_bytes(),
-                            mime_type,
-                        )
-                    else:  # pragma: no cover
-                        raise
-        except Exception as err:
-            raise ParseError(
-                f"Could not parse {document_path} with tika server at "
-                f"{settings.TIKA_ENDPOINT}: {err}",
-            ) from err
-
-        self.text = parsed.content
-        if self.text is not None:
-            self.text = self.text.strip()
-
-        self.date = parsed.created
-        if self.date is not None and timezone.is_naive(self.date):
-            self.date = timezone.make_aware(self.date)
-
-        self.archive_path = self.convert_to_pdf(document_path, file_name)
-
-    def convert_to_pdf(self, document_path: Path, file_name):
-        pdf_path = Path(self.tempdir) / "convert.pdf"
-
-        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
-
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.libre_office.to_pdf() as route,
-        ):
-            # Set the output format of the resulting PDF
-            if settings.OCR_OUTPUT_TYPE in {
-                OutputTypeChoices.PDF_A,
-                OutputTypeChoices.PDF_A2,
-            }:
-                route.pdf_format(PdfAFormat.A2b)
-            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
-                self.log.warning(
-                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
-                )
-                route.pdf_format(PdfAFormat.A2b)
-            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
-                route.pdf_format(PdfAFormat.A3b)
-
-            route.convert(document_path)
-
-            try:
-                response = route.run()
-
-                pdf_path.write_bytes(response.content)
-
-                return pdf_path
-
-            except Exception as err:
-                raise ParseError(
-                    f"Error while converting document to PDF: {err}",
-                ) from err
-
-    def get_settings(self) -> OutputTypeConfig:
-        """
-        This parser only uses the PDF output type configuration currently
-        """
-        return OutputTypeConfig()
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -1,7 +1,15 @@
 def get_parser(*args, **kwargs):
-    from paperless_tika.parsers import TikaDocumentParser
+    from paperless.parsers.tika import TikaDocumentParser

-    return TikaDocumentParser(*args, **kwargs)
+    # TikaDocumentParser accepts logging_group for constructor compatibility but
+    # does not store or use it (no legacy DocumentParser base class).
+    # progress_callback is also not used.  Both may arrive as a positional arg
+    # (consumer) or a keyword arg (views); *args absorbs the positional form,
+    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
+    # path with the new ParserRegistry so the shim can be removed at that point.
+    kwargs.pop("logging_group", None)
+    kwargs.pop("progress_callback", None)
+    return TikaDocumentParser()


 def tika_consumer_declaration(sender, **kwargs):
--- a/src/paperless_tika/tests/__init__.py
+++ b/src/paperless_tika/tests/__init__.py
--- a/src/paperless_tika/tests/conftest.py
+++ b/src/paperless_tika/tests/conftest.py
@@ -1,41 +0,0 @@
-from collections.abc import Generator
-from pathlib import Path
-
-import pytest
-
-from paperless_tika.parsers import TikaDocumentParser
-
-
-@pytest.fixture()
-def tika_parser() -> Generator[TikaDocumentParser, None, None]:
-    try:
-        parser = TikaDocumentParser(logging_group=None)
-        yield parser
-    finally:
-        # TODO(stumpylog): Cleanup once all parsers are handled
-        parser.cleanup()
-
-
-@pytest.fixture(scope="session")
-def sample_dir() -> Path:
-    return (Path(__file__).parent / Path("samples")).resolve()
-
-
-@pytest.fixture(scope="session")
-def sample_odt_file(sample_dir: Path) -> Path:
-    return sample_dir / "sample.odt"
-
-
-@pytest.fixture(scope="session")
-def sample_docx_file(sample_dir: Path) -> Path:
-    return sample_dir / "sample.docx"
-
-
-@pytest.fixture(scope="session")
-def sample_doc_file(sample_dir: Path) -> Path:
-    return sample_dir / "sample.doc"
-
-
-@pytest.fixture(scope="session")
-def sample_broken_odt(sample_dir: Path) -> Path:
-    return sample_dir / "multi-part-broken.odt"
--- a/uv.lock
+++ b/uv.lock
@@ -5639,7 +5639,7 @@ wheels = [

 [[package]]
 name = "zensical"
-version = "0.0.24"
+version = "0.0.25"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -5649,18 +5649,16 @@ dependencies = [
    { name = "pymdown-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/3b/96/9c6cbdd7b351d1023cdbbcf7872d4cb118b0334cfe5821b99e0dd18e3f00/zensical-0.0.24.tar.gz", hash = "sha256:b5d99e225329bf4f98c8022bdf0a0ee9588c2fada7b4df1b7b896fcc62b37ec3", size = 3840688, upload-time = "2026-02-26T09:43:44.557Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/18/69/4b49ce778059b4888ea854cf4db40e1b2080fe828b7280198999048d6fce/zensical-0.0.25.tar.gz", hash = "sha256:462808359d949469fa7209d367f2e38ed796744074e5dadeac9ddfef0c44be25", size = 3841318, upload-time = "2026-03-10T19:32:35.048Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/8e/aa/b8201af30e376a67566f044a1c56210edac5ae923fd986a836d2cf593c9c/zensical-0.0.24-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d390c5453a5541ca35d4f9e1796df942b6612c546e3153dd928236d3b758409a", size = 12263407, upload-time = "2026-02-26T09:43:14.716Z" },
-    { url = "https://files.pythonhosted.org/packages/78/8e/3d910214471ade604fd39b080db3696864acc23678b5b4b8475c7dbfd2ce/zensical-0.0.24-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:81ac072869cf4d280853765b2bfb688653da0dfb9408f3ab15aca96455ab8223", size = 12142610, upload-time = "2026-02-26T09:43:17.546Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/d7/eb0983640aa0419ddf670298cfbcf8b75629b6484925429b857851e00784/zensical-0.0.24-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5eb1dfa84cae8e960bfa2c6851d2bc8e9710c4c4c683bd3aaf23185f646ae46", size = 12508380, upload-time = "2026-02-26T09:43:20.114Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/04/4405b9e6f937a75db19f0d875798a7eb70817d6a3bec2a2d289a2d5e8aea/zensical-0.0.24-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57d7c9e589da99c1879a1c703e67c85eaa6be4661cdc6ce6534f7bb3575983f4", size = 12440807, upload-time = "2026-02-26T09:43:22.679Z" },
-    { url = "https://files.pythonhosted.org/packages/12/dc/a7ca2a4224b3072a2c2998b6611ad7fd4f8f131ceae7aa23238d97d26e22/zensical-0.0.24-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42fcc121c3095734b078a95a0dae4d4924fb8fbf16bf730456146ad6cab48ad0", size = 12782727, upload-time = "2026-02-26T09:43:25.347Z" },
-    { url = "https://files.pythonhosted.org/packages/42/37/22f1727da356ed3fcbd31f68d4a477f15c232997c87e270cfffb927459ac/zensical-0.0.24-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:832d4a2a051b9f49561031a2986ace502326f82d9a401ddf125530d30025fdd4", size = 12547616, upload-time = "2026-02-26T09:43:28.031Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/ff/c75ff111b8e12157901d00752beef9d691dbb5a034b6a77359972262416a/zensical-0.0.24-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e5fea3bb61238dba9f930f52669db67b0c26be98e1c8386a05eb2b1e3cb875dc", size = 12684883, upload-time = "2026-02-26T09:43:30.642Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/92/4f6ea066382e3d068d3cadbed99e9a71af25e46c84a403e0f747960472a2/zensical-0.0.24-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:75eef0428eec2958590633fdc82dc2a58af124879e29573aa7e153b662978073", size = 12713825, upload-time = "2026-02-26T09:43:33.273Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/fb/bf735b19bce0034b1f3b8e1c50b2896ebbd0c5d92d462777e759e78bb083/zensical-0.0.24-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:3c6b39659156394ff805b4831dac108c839483d9efa4c9b901eaa913efee1ac7", size = 12854318, upload-time = "2026-02-26T09:43:35.632Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/28/0ddab6c1237e3625e7763ff666806f31e5760bb36d18624135a6bb6e8643/zensical-0.0.24-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9eef82865a18b3ca4c3cd13e245dff09a865d1da3c861e2fc86eaa9253a90f02", size = 12818270, upload-time = "2026-02-26T09:43:37.749Z" },
+    { url = "https://files.pythonhosted.org/packages/42/7c/f6f5eb1903b5a557d98f48d09e3d4bc33033ed78508986250dabe5387d91/zensical-0.0.25-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c481dd16a968f97d43f6b596e10e941d8294ed446b8b117235a6b149c0d6965", size = 12263809, upload-time = "2026-03-10T19:31:49.907Z" },
+    { url = "https://files.pythonhosted.org/packages/37/b2/3f8be43526a68c52c84f099887d1903c2526a22aa4344378a72671bf6070/zensical-0.0.25-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ae51751e8b11f50df04641b40c1e07d4b703fed9d9548b16dbcb0cf260da229a", size = 12146107, upload-time = "2026-03-10T19:31:53.576Z" },
+    { url = "https://files.pythonhosted.org/packages/16/59/89a3a715b1fe538b4b5ee382d71b86bd06d4f351383e36eefd36e824c150/zensical-0.0.25-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56ccf88245bd0b3684bf313384164972f1890802d4a51dd9b7ae6ea126a810bc", size = 12505963, upload-time = "2026-03-10T19:31:57.517Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/5b/cc0bada291818bdf36be777af9c16f655a021f16578a31e6fb233affca03/zensical-0.0.25-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2f4e58bcc06f3e50cc518666a0c9d8f82246255a42b37bb1d7c7343e214fbac", size = 12455496, upload-time = "2026-03-10T19:32:02.37Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/16/ff91ee42d8b14a1b63e2e0d74922e6c4b0ec1da3819377f20b7ca2742f76/zensical-0.0.25-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:69895273b1319a45667abac543c3e5065ff2a646d9a698eae056b6a35b57e00a", size = 12683609, upload-time = "2026-03-10T19:32:06.144Z" },
+    { url = "https://files.pythonhosted.org/packages/01/fd/a85acc4234d31658f4bb54c4900edfc8d4227ad83e4c79de92cfdcd05c79/zensical-0.0.25-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c51a00ae1de2e9647bfd0ea1965b223fb3891111a00930416e1277e06f3ab3c4", size = 12725420, upload-time = "2026-03-10T19:32:09.938Z" },
+    { url = "https://files.pythonhosted.org/packages/37/c7/896c91e457af3d5769d8d70d2bd66a8a287ad129879b51ab5e985ac68889/zensical-0.0.25-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:28e56ec1f06ea66227c1f5af9d7a6ed3bd4246e6af1e45d29e09f40251b52e1f", size = 12861970, upload-time = "2026-03-10T19:32:13.471Z" },
+    { url = "https://files.pythonhosted.org/packages/41/06/5d804cf19e4e093394674d9f213546dc1364a34fd85d81a1153b05733c5a/zensical-0.0.25-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2d5997baad148b65eb0de6baf81973110538e01a3f64467d06d0c5ac23b0d70", size = 12816321, upload-time = "2026-03-10T19:32:17.031Z" },
 ]

 [[package]]