Merge branch 'dev' into feature-tika-plugin-conversion

Removes basically empty directory
Cleans up the comment, which wasn't quite right
2026-03-14 05:01:24 +00:00 · 2026-03-13 15:11:13 -07:00 · 2026-03-13 12:53:21 -07:00 · 2026-03-13 12:41:32 -07:00 · 2026-03-13 11:18:53 -07:00 · 2026-03-13 09:27:52 -07:00
19 changed files with 642 additions and 216 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -157,6 +157,9 @@ updates:
      postgres:
        patterns:
          - "docker.io/library/postgres*"
      greenmail:
        patterns:
          - "docker.io/greenmail*"
  - package-ecosystem: "pre-commit" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
--- a/docker/compose/docker-compose.ci-test.yml
+++ b/docker/compose/docker-compose.ci-test.yml
@@ -18,13 +18,13 @@ services:
      - "--log-level=warn"
      - "--log-format=text"
  tika:
-    image: docker.io/apache/tika:latest
+    image: docker.io/apache/tika:3.2.3.0
    hostname: tika
    container_name: tika
    network_mode: host
    restart: unless-stopped
  greenmail:
-    image: greenmail/standalone:2.1.8
+    image: docker.io/greenmail/standalone:2.1.8
    hostname: greenmail
    container_name: greenmail
    environment:
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -52,6 +52,7 @@ from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 from paperless_mail.parsers import MailDocumentParser
 LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
    TODO(stumpylog): Remove me in the future
    """
-    if isinstance(parser, TextDocumentParser):
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
        parser.__exit__(None, None, None)
    else:
        parser.cleanup()
@@ -448,6 +449,12 @@ class ConsumerPlugin(
            progress_callback=progress_callback,
        )
        # New-style parsers use __enter__/__exit__ for resource management.
        # _parser_cleanup (below) handles __exit__; call __enter__ here.
        # TODO(stumpylog): Remove me in the future
        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
            document_parser.__enter__()
        self.log.debug(f"Parser: {type(document_parser).__name__}")
        # Parse the document. This may take some time.
@@ -476,7 +483,7 @@ class ConsumerPlugin(
                    self.filename,
                    self.input_doc.mailrule_id,
                )
-            elif isinstance(document_parser, TextDocumentParser):
+            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                # TODO(stumpylog): Remove me in the future
                document_parser.parse(self.working_copy, mime_type)
            else:
@@ -489,7 +496,7 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            if isinstance(document_parser, TextDocumentParser):
+            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
                # TODO(stumpylog): Remove me in the future
                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
            else:
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_tika.parsers import TikaDocumentParser
 class TestParserDiscovery(TestCase):
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -7,6 +7,7 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
 from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -225,6 +226,7 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
 from paperless.parsers import ParserProtocol
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1084,9 +1086,11 @@ class DocumentViewSet(
        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)
            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
            try:
-                return parser.extract_metadata(file, mime_type)
+                with cm:
                    return parser.extract_metadata(file, mime_type)
            except Exception:  # pragma: no cover
                logger.exception(f"Issue getting metadata for {file}")
                # TODO: cover GPG errors, remove later.
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -194,8 +194,10 @@ class ParserRegistry:
        at runtime regardless of registration order.
        """
        from paperless.parsers.text import TextDocumentParser
        from paperless.parsers.tika import TikaDocumentParser
        self.register_builtin(TextDocumentParser)
        self.register_builtin(TikaDocumentParser)
    # ------------------------------------------------------------------
    # Discovery
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -0,0 +1,440 @@
 """
 Built-in Tika document parser.
 Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
 sending them to an Apache Tika server for text extraction and a Gotenberg
 server for PDF conversion.  Because the source formats cannot be rendered by
 a browser natively, the parser always produces a PDF rendition for display.
 """
 from __future__ import annotations
 import logging
 import shutil
 import tempfile
 from contextlib import ExitStack
 from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Self
 import httpx
 from django.conf import settings
 from django.utils import timezone
 from gotenberg_client import GotenbergClient
 from gotenberg_client.options import PdfAFormat
 from tika_client import TikaClient
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.config import OutputTypeConfig
 from paperless.models import OutputTypeChoices
 from paperless.version import __full_version_str__
 if TYPE_CHECKING:
    import datetime
    from types import TracebackType
    from paperless.parsers import MetadataEntry
 logger = logging.getLogger("paperless.parsing.tika")
 # MIME type -> preferred file extension for every format this parser accepts.
 # Returned as-is by TikaDocumentParser.supported_mime_types() and used as the
 # membership test in TikaDocumentParser.score().
 _SUPPORTED_MIME_TYPES: dict[str, str] = {
    "application/msword": ".doc",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.ms-excel": ".xls",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-powerpoint": ".ppt",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
    "application/vnd.oasis.opendocument.presentation": ".odp",
    "application/vnd.oasis.opendocument.spreadsheet": ".ods",
    "application/vnd.oasis.opendocument.text": ".odt",
    "application/vnd.oasis.opendocument.graphics": ".odg",
    "text/rtf": ".rtf",
 }
 class TikaDocumentParser:
    """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
    Text extraction is handled by the Tika server.  PDF conversion for display
    is handled by Gotenberg (LibreOffice route).  Because the source formats
    cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
    True and the PDF is always produced regardless of the ``produce_archive``
    flag passed to ``parse``.
    Both ``TikaClient`` and ``GotenbergClient`` are opened once in
    ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
    ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
    ``ExitStack.close()`` in ``__exit__``.  The parser must always be used
    as a context manager.
    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Semantic version string, kept in sync with Paperless-ngx releases.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """
    name: str = "Paperless-ngx Tika Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"
    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------
    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """List the MIME types accepted by this parser.

        Returns
        -------
        dict[str, str]
            The module-level table that maps each supported MIME type to
            its preferred file extension.  The table itself is returned,
            not a copy.
        """
        mime_table = _SUPPORTED_MIME_TYPES
        return mime_table
    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Rate how well this parser can handle the given file.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension.  Not inspected here.
        path:
            Optional filesystem path.  Not inspected here.

        Returns
        -------
        int | None
            ``10`` when ``settings.TIKA_ENABLED`` is truthy and the MIME
            type is one of the supported types; ``None`` in every other
            case (Tika disabled, or unsupported MIME type).
        """
        # The enabled flag is checked first so the MIME lookup is skipped
        # entirely when Tika integration is switched off.
        if settings.TIKA_ENABLED and mime_type in _SUPPORTED_MIME_TYPES:
            return 10
        return None
    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------
    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.
        Returns
        -------
        bool
            Always False — Tika produces a display PDF, not an OCR archive.
        """
        return False
    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.
        Returns
        -------
        bool
            Always True — Office formats cannot be rendered natively in a
            browser, so a PDF conversion is always required for display.
        """
        return True
    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def __init__(self, logging_group: object = None) -> None:
        """Create the working directory and initialise empty parser state.

        Parameters
        ----------
        logging_group:
            Accepted by the constructor but not read or stored here.
        """
        # Make sure the scratch root exists, then carve out a private
        # temporary directory underneath it for this parser instance.
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        workdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
        self._tempdir = Path(workdir)

        # Results filled in by parse() / get_thumbnail().
        self._text: str | None = None
        self._date: datetime.datetime | None = None
        self._archive_path: Path | None = None

        # The clients stay unset until __enter__ opens them on the stack.
        self._exit_stack = ExitStack()
        self._tika_client: TikaClient | None = None
        self._gotenberg_client: GotenbergClient | None = None
    def __enter__(self) -> Self:
        """Open the Tika and Gotenberg clients and return the parser.

        Both clients are entered through the parser's ``ExitStack`` so that
        ``__exit__`` can close them together.

        Returns
        -------
        Self
            This parser instance with both clients opened.

        Raises
        ------
        BaseException
            Any error raised while creating or entering a client is
            re-raised unchanged, after the clients opened so far have
            been closed again.
        """
        try:
            self._tika_client = self._exit_stack.enter_context(
                TikaClient(
                    tika_url=settings.TIKA_ENDPOINT,
                    timeout=settings.CELERY_TASK_TIME_LIMIT,
                ),
            )
            self._gotenberg_client = self._exit_stack.enter_context(
                GotenbergClient(
                    host=settings.TIKA_GOTENBERG_ENDPOINT,
                    timeout=settings.CELERY_TASK_TIME_LIMIT,
                ),
            )
        except BaseException:
            # A ``with`` statement does not call __exit__ when __enter__
            # raises, so a client that was already opened (e.g. Tika, when
            # Gotenberg setup fails) has to be released here or it leaks.
            self._exit_stack.close()
            self._tika_client = None
            self._gotenberg_client = None
            raise
        return self
    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Close the clients and delete the temporary directory.

        Parameters
        ----------
        exc_type, exc_val, exc_tb:
            Exception details supplied by the ``with`` statement.  They are
            not inspected; this method returns ``None``, so an in-flight
            exception is never suppressed.
        """
        try:
            self._exit_stack.close()
        finally:
            # Remove the scratch directory even when closing a client
            # raised, otherwise the directory would be left behind.
            logger.debug("Cleaning up temporary directory %s", self._tempdir)
            shutil.rmtree(self._tempdir, ignore_errors=True)
    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------
    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Send the document to Tika for text extraction and Gotenberg for PDF.
        Because ``requires_pdf_rendition`` is True the PDF conversion is
        always performed — the ``produce_archive`` flag is intentionally
        ignored.
        The extracted text, creation date and PDF path are stored on the
        instance and read back through ``get_text``, ``get_date`` and
        ``get_archive_path``.
        Parameters
        ----------
        document_path:
            Absolute path to the document file to parse.
        mime_type:
            Detected MIME type of the document.
        produce_archive:
            Accepted for protocol compatibility but ignored; the PDF rendition
            is always produced since the source format cannot be displayed
            natively in the browser.
        Raises
        ------
        documents.parsers.ParseError
            If Tika or Gotenberg returns an error.
        """
        # Type-narrowing hint for static checkers only; TYPE_CHECKING is
        # False at runtime, so this assert never executes.
        if TYPE_CHECKING:
            assert self._tika_client is not None
        logger.info("Sending %s to Tika server", document_path)
        # Outer try: any failure of either upload attempt below (including a
        # failure of the fallback itself) is re-raised as ParseError.
        try:
            # Inner try: first attempt uploads the file by path.
            try:
                parsed = self._tika_client.tika.as_text.from_file(
                    document_path,
                    mime_type,
                )
            except httpx.HTTPStatusError as err:
                # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                # Tika fails with some files as multi-part form data
                if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                    # Retry once, sending the raw file bytes instead.
                    parsed = self._tika_client.tika.as_text.from_buffer(
                        document_path.read_bytes(),
                        mime_type,
                    )
                else:  # pragma: no cover
                    raise
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err
        self._text = parsed.content
        # The content may be None; only strip when text was returned.
        if self._text is not None:
            self._text = self._text.strip()
        self._date = parsed.created
        # A naive creation date is converted to a timezone-aware one.
        if self._date is not None and timezone.is_naive(self._date):
            self._date = timezone.make_aware(self._date)
        # Always convert — requires_pdf_rendition=True means the browser
        # cannot display the source format natively.
        self._archive_path = self._convert_to_pdf(document_path)
    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------
    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse.
        Returns
        -------
        str | None
            Extracted text, or None if parse has not been called yet.
        """
        return self._text
    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.
        Returns
        -------
        datetime.datetime | None
            Creation date from Tika metadata, or None if not detected.
        """
        return self._date
    def get_archive_path(self) -> Path | None:
        """Return the path to the generated PDF rendition, or None.
        Returns
        -------
        Path | None
            Path to the PDF produced by Gotenberg, or None if parse has not
            been called yet.
        """
        return self._archive_path
    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------
    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Build a thumbnail from the document's PDF rendition.

        The PDF produced by an earlier conversion is reused when one is
        stored; otherwise the document is converted first and the result
        is remembered for later calls.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.  Not inspected here.

        Returns
        -------
        Path
            Whatever ``make_thumbnail_from_pdf`` returns for the PDF and
            this parser's temporary directory.
        """
        pdf_rendition = self._archive_path
        if pdf_rendition is None:
            pdf_rendition = self._convert_to_pdf(document_path)
            self._archive_path = pdf_rendition
        return make_thumbnail_from_pdf(pdf_rendition, self._tempdir)
    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document.
        Returns
        -------
        int | None
            Always None — page count is not available from Tika.
        """
        return None
    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata via the Tika metadata endpoint.
        Returns
        -------
        list[MetadataEntry]
            All key/value pairs returned by Tika, or ``[]`` on error.
        """
        if TYPE_CHECKING:
            assert self._tika_client is not None
        try:
            parsed = self._tika_client.metadata.from_file(document_path, mime_type)
            return [
                {
                    "namespace": "",
                    "prefix": "",
                    "key": key,
                    "value": parsed.data[key],
                }
                for key in parsed.data
            ]
        except Exception as e:
            logger.warning(
                "Error while fetching document metadata for %s: %s",
                document_path,
                e,
            )
            return []
    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _convert_to_pdf(self, document_path: Path) -> Path:
        """Render the document as a PDF through Gotenberg's LibreOffice route.

        The PDF/A flavour requested from Gotenberg follows the configured
        output type; when the output type matches none of the PDF/A
        choices, no PDF/A format is requested.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.

        Returns
        -------
        Path
            Location of ``convert.pdf`` inside the parser's temporary
            directory.

        Raises
        ------
        documents.parsers.ParseError
            If running the conversion or writing the result fails.
        """
        if TYPE_CHECKING:
            assert self._gotenberg_client is not None
        target = self._tempdir / "convert.pdf"
        logger.info("Converting %s to PDF as %s", document_path, target)
        with self._gotenberg_client.libre_office.to_pdf() as route:
            # OutputTypeConfig reads the database-stored ApplicationConfiguration
            # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
            configured = OutputTypeConfig().output_type

            # Decide on the PDF/A flavour first, then apply it in one place.
            pdfa_format = None
            if configured == OutputTypeChoices.PDF_A1:
                logger.warning(
                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                )
                pdfa_format = PdfAFormat.A2b
            elif configured == OutputTypeChoices.PDF_A3:
                pdfa_format = PdfAFormat.A3b
            elif configured in {
                OutputTypeChoices.PDF_A,
                OutputTypeChoices.PDF_A2,
            }:
                pdfa_format = PdfAFormat.A2b
            if pdfa_format is not None:
                route.pdf_format(pdfa_format)

            route.convert(document_path)
            try:
                converted = route.run()
                target.write_bytes(converted.content)
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err
            return target
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING
 import pytest
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
 if TYPE_CHECKING:
    from collections.abc import Generator
@@ -74,3 +75,86 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
    """
    with TextDocumentParser() as parser:
        yield parser
 # ------------------------------------------------------------------
 # Tika parser sample files
 # ------------------------------------------------------------------
@pytest.fixture(scope="session")
 def tika_samples_dir(samples_dir: Path) -> Path:
    """Absolute path to the Tika parser sample files directory.
    Returns
    -------
    Path
        ``<samples_dir>/tika/``
    """
    return samples_dir / "tika"
@pytest.fixture(scope="session")
 def sample_odt_file(tika_samples_dir: Path) -> Path:
    """Path to a sample ODT file.
    Returns
    -------
    Path
        Absolute path to ``tika/sample.odt``.
    """
    return tika_samples_dir / "sample.odt"
@pytest.fixture(scope="session")
 def sample_docx_file(tika_samples_dir: Path) -> Path:
    """Path to a sample DOCX file.
    Returns
    -------
    Path
        Absolute path to ``tika/sample.docx``.
    """
    return tika_samples_dir / "sample.docx"
@pytest.fixture(scope="session")
 def sample_doc_file(tika_samples_dir: Path) -> Path:
    """Path to a sample DOC file.
    Returns
    -------
    Path
        Absolute path to ``tika/sample.doc``.
    """
    return tika_samples_dir / "sample.doc"
@pytest.fixture(scope="session")
 def sample_broken_odt(tika_samples_dir: Path) -> Path:
    """Path to a broken ODT file that triggers the multi-part fallback.
    Returns
    -------
    Path
        Absolute path to ``tika/multi-part-broken.odt``.
    """
    return tika_samples_dir / "multi-part-broken.odt"
 # ------------------------------------------------------------------
 # Tika parser instance
 # ------------------------------------------------------------------
@pytest.fixture()
 def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """Yield a TikaDocumentParser and clean up its temporary directory afterwards.
    Yields
    ------
    TikaDocumentParser
        A ready-to-use parser instance.
    """
    with TikaDocumentParser() as parser:
        yield parser
--- a/src/paperless/tests/parsers/test_tika_liva.py
+++ b/src/paperless/tests/parsers/test_tika_liva.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import pytest
 from documents.tests.utils import util_call_with_backoff
-from paperless_tika.parsers import TikaDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
@pytest.mark.skipif(
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
        )
        assert (
-            tika_parser.text
+            tika_parser.get_text()
            == "This is an ODT test document, created September 14, 2022"
        )
-        assert tika_parser.archive_path is not None
+        archive = tika_parser.get_archive_path()
-        assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
+        assert archive is not None
        assert b"PDF-" in archive.read_bytes()[:10]
        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
-        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
+        # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
    def test_basic_parse_docx(
        self,
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
        )
        assert (
-            tika_parser.text
+            tika_parser.get_text()
            == "This is an DOCX test document, also made September 14, 2022"
        )
-        assert tika_parser.archive_path is not None
+        archive = tika_parser.get_archive_path()
-        with Path(tika_parser.archive_path).open("rb") as f:
+        assert archive is not None
        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]
-        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
+        # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
    def test_basic_parse_doc(
        self,
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
            [sample_doc_file, "application/msword"],
        )
-        assert tika_parser.text is not None
+        text = tika_parser.get_text()
-        assert (
+        assert text is not None
-            "This is a test document, saved in the older .doc format"
+        assert "This is a test document, saved in the older .doc format" in text
-            in tika_parser.text
+        archive = tika_parser.get_archive_path()
-        )
+        assert archive is not None
-        assert tika_parser.archive_path is not None
+        with archive.open("rb") as f:
        with Path(tika_parser.archive_path).open("rb") as f:
            assert b"PDF-" in f.read()[:10]
    def test_tika_fails_multi_part(
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
            [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
        )
-        assert tika_parser.archive_path is not None
+        archive = tika_parser.get_archive_path()
-        with Path(tika_parser.archive_path).open("rb") as f:
+        assert archive is not None
        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -9,7 +9,56 @@ from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock
 from documents.parsers import ParseError
-from paperless_tika.parsers import TikaDocumentParser
+from paperless.parsers import ParserProtocol
 from paperless.parsers.tika import TikaDocumentParser
 class TestTikaParserRegistryInterface:
    """Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
    def test_satisfies_parser_protocol(self) -> None:
        assert isinstance(TikaDocumentParser(), ParserProtocol)
    def test_supported_mime_types_is_classmethod(self) -> None:
        mime_types = TikaDocumentParser.supported_mime_types()
        assert isinstance(mime_types, dict)
        assert len(mime_types) > 0
    def test_score_returns_none_when_tika_disabled(
        self,
        settings: SettingsWrapper,
    ) -> None:
        settings.TIKA_ENABLED = False
        result = TikaDocumentParser.score(
            "application/vnd.oasis.opendocument.text",
            "sample.odt",
        )
        assert result is None
    def test_score_returns_int_when_tika_enabled(
        self,
        settings: SettingsWrapper,
    ) -> None:
        settings.TIKA_ENABLED = True
        result = TikaDocumentParser.score(
            "application/vnd.oasis.opendocument.text",
            "sample.odt",
        )
        assert isinstance(result, int)
    def test_score_returns_none_for_unsupported_mime(
        self,
        settings: SettingsWrapper,
    ) -> None:
        settings.TIKA_ENABLED = True
        result = TikaDocumentParser.score("application/pdf", "doc.pdf")
        assert result is None
    def test_can_produce_archive_is_false(self) -> None:
        assert TikaDocumentParser().can_produce_archive is False
    def test_requires_pdf_rendition_is_true(self) -> None:
        assert TikaDocumentParser().requires_pdf_rendition is True
@pytest.mark.django_db()
@@ -36,12 +85,12 @@ class TestTikaParser:
        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
-        assert tika_parser.text == "the content"
+        assert tika_parser.get_text() == "the content"
-        assert tika_parser.archive_path is not None
+        assert tika_parser.get_archive_path() is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        with Path(tika_parser.get_archive_path()).open("rb") as f:
            assert f.read() == b"PDF document"
-        assert tika_parser.date == datetime.datetime(
+        assert tika_parser.get_date() == datetime.datetime(
            2020,
            11,
            21,
@@ -89,7 +138,7 @@ class TestTikaParser:
        httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
        with pytest.raises(ParseError):
-            tika_parser.convert_to_pdf(sample_odt_file, None)
+            tika_parser._convert_to_pdf(sample_odt_file)
    @pytest.mark.parametrize(
        ("setting_value", "expected_form_value"),
@@ -106,7 +155,6 @@ class TestTikaParser:
        expected_form_value: str,
        httpx_mock: HTTPXMock,
        settings: SettingsWrapper,
        tika_parser: TikaDocumentParser,
        sample_odt_file: Path,
    ) -> None:
        """
@@ -117,6 +165,8 @@ class TestTikaParser:
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
        # OutputTypeConfig is instantiated inside _convert_to_pdf, so the
        # setting only has to be changed before that call is made.
        settings.OCR_OUTPUT_TYPE = setting_value
        httpx_mock.add_response(
            status_code=codes.OK,
@@ -124,7 +174,8 @@ class TestTikaParser:
            method="POST",
        )
-        tika_parser.convert_to_pdf(sample_odt_file, None)
+        with TikaDocumentParser() as parser:
            parser._convert_to_pdf(sample_odt_file)
        request = httpx_mock.get_request()
--- a/src/paperless/tests/samples/tika/multi-part-broken.odt
+++ b/src/paperless/tests/samples/tika/multi-part-broken.odt
--- a/src/paperless/tests/samples/tika/sample.doc
+++ b/src/paperless/tests/samples/tika/sample.doc
--- a/src/paperless/tests/samples/tika/sample.docx
+++ b/src/paperless/tests/samples/tika/sample.docx
--- a/src/paperless/tests/samples/tika/sample.odt
+++ b/src/paperless/tests/samples/tika/sample.odt
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,10 +1,12 @@
 def get_parser(*args, **kwargs):
    from paperless.parsers.text import TextDocumentParser
-    # The new TextDocumentParser does not accept the legacy logging_group /
+    # TextDocumentParser accepts logging_group for constructor compatibility but
-    # progress_callback kwargs injected by the old signal-based consumer.
+    # does not store or use it (no legacy DocumentParser base class).
-    # These are dropped here; Phase 4 will replace this signal path with the
+    # progress_callback is also not used.  Both may arrive as a positional arg
-    # new ParserRegistry so the shim can be removed at that point.
+    # (consumer) or a keyword arg (views); *args absorbs the positional form,
    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
    # path with the new ParserRegistry so the shim can be removed at that point.
    kwargs.pop("logging_group", None)
    kwargs.pop("progress_callback", None)
    return TextDocumentParser()
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,136 +0,0 @@
 from pathlib import Path
 import httpx
 from django.conf import settings
 from django.utils import timezone
 from gotenberg_client import GotenbergClient
 from gotenberg_client.options import PdfAFormat
 from tika_client import TikaClient
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from paperless.config import OutputTypeConfig
 from paperless.models import OutputTypeChoices
 class TikaDocumentParser(DocumentParser):
    """
    Parser which hands documents to a Tika server for text and metadata
    extraction and to Gotenberg for conversion to PDF.
    """

    logging_name = "paperless.parsing.tika"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Build a thumbnail from the PDF rendition of the document.

        When no archive path has been set yet, the document is converted to
        PDF first and the result is stored as the archive path; otherwise the
        existing archive path is reused.
        """
        already_converted = bool(self.archive_path)
        if not already_converted:
            self.archive_path = self.convert_to_pdf(document_path, file_name)

        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def extract_metadata(self, document_path, mime_type):
        """
        Ask the Tika server for the metadata of the document.

        Returns a list with one dict per metadata key, each holding empty
        "namespace" and "prefix" entries plus the "key" and its "value".
        Any failure is logged as a warning and an empty list is returned
        instead of raising.
        """
        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                parsed = client.metadata.from_file(document_path, mime_type)
                entries = []
                for key in parsed.data:
                    entries.append(
                        {
                            "namespace": "",
                            "prefix": "",
                            "key": key,
                            "value": parsed.data[key],
                        },
                    )
                return entries
        except Exception as e:
            # Metadata is best effort: report the problem and carry on
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
        """
        Extract text and creation date via Tika, then convert to PDF.

        Sets the text (stripped when present), the date (made timezone aware
        when naive) and the archive path on the parser.

        Raises ParseError when the request to the Tika server fails.
        """
        self.log.info(f"Sending {document_path} to Tika server")

        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                try:
                    parsed = client.tika.as_text.from_file(document_path, mime_type)
                except httpx.HTTPStatusError as err:
                    if (
                        err.response.status_code != httpx.codes.INTERNAL_SERVER_ERROR
                    ):  # pragma: no cover
                        raise
                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                    # Tika fails with some files as multi-part form data,
                    # so retry by sending the raw bytes instead
                    parsed = client.tika.as_text.from_buffer(
                        document_path.read_bytes(),
                        mime_type,
                    )
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

        content = parsed.content
        self.text = content if content is None else content.strip()

        created = parsed.created
        if created is not None and timezone.is_naive(created):
            created = timezone.make_aware(created)
        self.date = created

        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path: Path, file_name):
        """
        Convert the document to PDF through Gotenberg's LibreOffice route.

        The PDF is written as "convert.pdf" inside the parser's temporary
        directory and that path is returned.  The requested PDF/A flavour is
        taken from settings.OCR_OUTPUT_TYPE.

        Raises ParseError when running the conversion or writing the result
        fails.
        """
        pdf_path = Path(self.tempdir) / "convert.pdf"
        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")

        with GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client:
            with client.libre_office.to_pdf() as route:
                # Set the output format of the resulting PDF
                requested_type = settings.OCR_OUTPUT_TYPE
                if requested_type in {
                    OutputTypeChoices.PDF_A,
                    OutputTypeChoices.PDF_A2,
                }:
                    route.pdf_format(PdfAFormat.A2b)
                elif requested_type == OutputTypeChoices.PDF_A1:
                    self.log.warning(
                        "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                    )
                    route.pdf_format(PdfAFormat.A2b)
                elif requested_type == OutputTypeChoices.PDF_A3:
                    route.pdf_format(PdfAFormat.A3b)

                route.convert(document_path)

                try:
                    response = route.run()
                    pdf_path.write_bytes(response.content)
                except Exception as err:
                    raise ParseError(
                        f"Error while converting document to PDF: {err}",
                    ) from err
                return pdf_path

    def get_settings(self) -> OutputTypeConfig:
        """
        Return the configuration for this parser, which currently is only
        the PDF output type configuration.
        """
        return OutputTypeConfig()
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -1,7 +1,15 @@
 def get_parser(*args, **kwargs):
-    from paperless_tika.parsers import TikaDocumentParser
+    from paperless.parsers.tika import TikaDocumentParser
-    return TikaDocumentParser(*args, **kwargs)
+    # TikaDocumentParser accepts logging_group for constructor compatibility but
    # does not store or use it (no legacy DocumentParser base class).
    # progress_callback is also not used.  Both may arrive as a positional arg
    # (consumer) or a keyword arg (views); *args absorbs the positional form,
    # kwargs.pop handles the keyword form.  Phase 4 will replace this signal
    # path with the new ParserRegistry so the shim can be removed at that point.
    kwargs.pop("logging_group", None)
    kwargs.pop("progress_callback", None)
    return TikaDocumentParser()
 def tika_consumer_declaration(sender, **kwargs):
--- a/src/paperless_tika/tests/__init__.py
+++ b/src/paperless_tika/tests/__init__.py
--- a/src/paperless_tika/tests/conftest.py
+++ b/src/paperless_tika/tests/conftest.py
@@ -1,41 +0,0 @@
 from collections.abc import Generator
 from pathlib import Path
 import pytest
 from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    Provide a TikaDocumentParser to a test and clean it up afterwards.

    Yields:
        A parser constructed with ``logging_group=None``.
    """
    # Construct the parser before entering the try block.  If construction
    # fails there is nothing to clean up, and the finally clause must not
    # touch an unbound name and hide the original error.
    parser = TikaDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        # TODO(stumpylog): Cleanup once all parsers are handled
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Return the resolved path of the "samples" directory next to this file."""
    this_directory = Path(__file__).parent
    return this_directory.joinpath(Path("samples")).resolve()
@pytest.fixture(scope="session")
def sample_odt_file(sample_dir: Path) -> Path:
    """Return the path of "sample.odt" inside the sample directory."""
    odt_path = sample_dir.joinpath("sample.odt")
    return odt_path
@pytest.fixture(scope="session")
def sample_docx_file(sample_dir: Path) -> Path:
    """Return the path of "sample.docx" inside the sample directory."""
    docx_path = sample_dir.joinpath("sample.docx")
    return docx_path
@pytest.fixture(scope="session")
def sample_doc_file(sample_dir: Path) -> Path:
    """Return the path of "sample.doc" inside the sample directory."""
    doc_path = sample_dir.joinpath("sample.doc")
    return doc_path
@pytest.fixture(scope="session")
def sample_broken_odt(sample_dir: Path) -> Path:
    """Return the path of "multi-part-broken.odt" inside the sample directory."""
    broken_path = sample_dir.joinpath("multi-part-broken.odt")
    return broken_path
Author	SHA1	Message	Date
Trenton H	b8069d24b1	Merge branch 'dev' into feature-tika-plugin-conversion	2026-03-13 15:11:13 -07:00
Trenton H	da06dd2c09	Removes basically empty directory	2026-03-13 12:53:21 -07:00
Trenton H	bc01e000ad	Cleans up the comment, which wasn't quite right	2026-03-13 12:41:32 -07:00
Trenton H	23b051b2ee	Locks down the Tika version and adds Greenmail for Dependabot as well	2026-03-13 11:18:53 -07:00
Trenton H	644a0f3c6b	Register the builtin Tika parser	2026-03-13 09:27:52 -07:00
Trenton H	dcf4402b15	Renames so it aligns better in the browser view	2026-03-13 09:27:48 -07:00
Trenton H	89d00247f6	Fix: require context manager for TikaDocumentParser; clean up client lifecycle - consumer.py: call __enter__ for new-style parsers so _tika_client and _gotenberg_client are set before parse() is invoked - views.py: use `with parser` (via nullcontext for old-style parsers) in get_metadata so extract_metadata always runs inside a context manager - tika.py: GotenbergClient added to ExitStack alongside TikaClient; inline client creation removed from extract_metadata and _convert_to_pdf; __exit__ uses ExitStack.close() instead of __exit__ pass-through Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 09:27:42 -07:00
Trenton H	c16bcb7fef	Fix: satisfy mypy and pyrefly for TikaDocumentParser Use a TYPE_CHECKING-guarded assert to narrow self._tika_client from TikaClient \| None to TikaClient at the point of use in parse(). The assert is visible to type checkers (TYPE_CHECKING=True) so both mypy and pyrefly accept the subsequent attribute accesses without error; at runtime TYPE_CHECKING is False so the assert never executes and no ruff S101 suppression is required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 09:27:38 -07:00
Trenton H	d0b95f2cda	Fix: update remaining imports and move live Tika tests after parser migration - src/documents/tests/test_parsers.py: import TikaDocumentParser from paperless.parsers.tika (old paperless_tika.parsers no longer exists) - git mv paperless_tika/tests/test_live_tika.py → paperless/tests/parsers/test_live_tika.py to co-locate all Tika tests with the parser; update import and replace old attribute API (tika_parser.text/.archive_path) with accessor methods (get_text/get_archive_path) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 09:27:29 -07:00
Trenton H	2b33617262	Feature: Phase 3 — migrate TikaDocumentParser to ParserProtocol Refactor TikaDocumentParser to satisfy ParserProtocol without subclassing the legacy DocumentParser ABC: - Add ClassVars: name, version, author, url - Add supported_mime_types() classmethod (12 Office/ODF/RTF MIME types) - Add score() classmethod — returns None when TIKA_ENABLED is False, 10 otherwise - can_produce_archive = False (PDF is for display, not an OCR archive) - requires_pdf_rendition = True (Office formats need PDF for browser display) - __enter__/__exit__ via ExitStack: TikaClient opened once per parser lifetime and shared across parse() and extract_metadata() calls - extract_metadata() falls back to a short-lived TikaClient when called outside a context manager (legacy view-layer metadata path) - _convert_to_pdf() uses OutputTypeConfig() to honour the database-stored ApplicationConfiguration before falling back to the env-var setting - Rename convert_to_pdf → _convert_to_pdf (private helper) Update paperless_tika/signals.py shim to import from the new module path and drop the legacy logging_group/progress_callback kwargs. Update documents/consumer.py to extend the existing TextDocumentParser special cases to also cover TikaDocumentParser (parse/get_thumbnail signatures, __exit__ cleanup). Add TestTikaParserRegistryInterface (7 tests) covering score(), properties, and ParserProtocol isinstance check. Update existing tests to use the new accessor API (get_text, get_date, get_archive_path, _convert_to_pdf). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 09:26:49 -07:00
Trenton H	0a9c67e9b1	Chore: move Tika parser and tests to paperless/ Move TikaDocumentParser and its tests to the canonical parser package location, matching the pattern established for TextDocumentParser: - src/paperless_tika/parsers.py → src/paperless/parsers/tika.py - src/paperless_tika/tests/test_tika_parser.py → src/paperless/tests/parsers/test_tika_parser.py - src/paperless_tika/tests/samples/ → src/paperless/tests/samples/tika/ Merge tika fixtures (tika_parser, sample_odt_file, sample_docx_file, sample_doc_file, sample_broken_odt) into the shared parsers conftest. Remove the now-empty src/paperless_tika/tests/conftest.py. Content is unchanged — this commit is rename-only so git history is preserved on the moved files. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 09:26:26 -07:00