From 2b3361726209449e3b044ea01f971bec69049dce Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:30:59 -0700 Subject: [PATCH] =?UTF-8?q?Feature:=20Phase=203=20=E2=80=94=20migrate=20Ti?= =?UTF-8?q?kaDocumentParser=20to=20ParserProtocol?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor TikaDocumentParser to satisfy ParserProtocol without subclassing the legacy DocumentParser ABC: - Add ClassVars: name, version, author, url - Add supported_mime_types() classmethod (12 Office/ODF/RTF MIME types) - Add score() classmethod — returns None when TIKA_ENABLED is False, 10 otherwise - can_produce_archive = False (PDF is for display, not an OCR archive) - requires_pdf_rendition = True (Office formats need PDF for browser display) - __enter__/__exit__ via ExitStack: TikaClient opened once per parser lifetime and shared across parse() and extract_metadata() calls - extract_metadata() falls back to a short-lived TikaClient when called outside a context manager (legacy view-layer metadata path) - _convert_to_pdf() uses OutputTypeConfig() to honour the database-stored ApplicationConfiguration before falling back to the env-var setting - Rename convert_to_pdf → _convert_to_pdf (private helper) Update paperless_tika/signals.py shim to import from the new module path and drop the legacy logging_group/progress_callback kwargs. Update documents/consumer.py to extend the existing TextDocumentParser special cases to also cover TikaDocumentParser (parse/get_thumbnail signatures, __exit__ cleanup). Add TestTikaParserRegistryInterface (7 tests) covering score(), properties, and ParserProtocol isinstance check. Update existing tests to use the new accessor API (get_text, get_date, get_archive_path, _convert_to_pdf). 
Co-Authored-By: Claude Sonnet 4.6 --- src/documents/consumer.py | 7 +- src/paperless/parsers/tika.py | 453 +++++++++++++++--- .../tests/parsers/test_tika_parser.py | 67 ++- src/paperless_tika/signals.py | 10 +- 4 files changed, 451 insertions(+), 86 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index cbc2198ef..fadd9a4e6 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -52,6 +52,7 @@ from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess from paperless.parsers.text import TextDocumentParser +from paperless.parsers.tika import TikaDocumentParser from paperless_mail.parsers import MailDocumentParser LOGGING_NAME: Final[str] = "paperless.consumer" @@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None: TODO(stumpylog): Remove me in the future """ - if isinstance(parser, TextDocumentParser): + if isinstance(parser, (TextDocumentParser, TikaDocumentParser)): parser.__exit__(None, None, None) else: parser.cleanup() @@ -476,7 +477,7 @@ class ConsumerPlugin( self.filename, self.input_doc.mailrule_id, ) - elif isinstance(document_parser, TextDocumentParser): + elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): # TODO(stumpylog): Remove me in the future document_parser.parse(self.working_copy, mime_type) else: @@ -489,7 +490,7 @@ class ConsumerPlugin( ProgressStatusOptions.WORKING, ConsumerStatusShortMessage.GENERATING_THUMBNAIL, ) - if isinstance(document_parser, TextDocumentParser): + if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): # TODO(stumpylog): Remove me in the future thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) else: diff --git a/src/paperless/parsers/tika.py b/src/paperless/parsers/tika.py index 22a5bc1c6..467173e85 100644 --- a/src/paperless/parsers/tika.py +++ b/src/paperless/parsers/tika.py @@ -1,4 +1,21 @@ +""" +Built-in 
Tika document parser. + +Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by +sending them to an Apache Tika server for text extraction and a Gotenberg +server for PDF conversion. Because the source formats cannot be rendered by +a browser natively, the parser always produces a PDF rendition for display. +""" + +from __future__ import annotations + +import logging +import shutil +import tempfile +from contextlib import ExitStack from pathlib import Path +from typing import TYPE_CHECKING +from typing import Self import httpx from django.conf import settings @@ -7,92 +24,388 @@ from gotenberg_client import GotenbergClient from gotenberg_client.options import PdfAFormat from tika_client import TikaClient -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf from paperless.config import OutputTypeConfig from paperless.models import OutputTypeChoices +from paperless.version import __full_version_str__ + +if TYPE_CHECKING: + import datetime + from types import TracebackType + + from paperless.parsers import MetadataEntry + +logger = logging.getLogger("paperless.parsing.tika") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "application/msword": ".doc", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "application/vnd.ms-excel": ".xls", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", + "application/vnd.ms-powerpoint": ".ppt", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx", + "application/vnd.oasis.opendocument.presentation": ".odp", + "application/vnd.oasis.opendocument.spreadsheet": ".ods", + "application/vnd.oasis.opendocument.text": ".odt", + "application/vnd.oasis.opendocument.graphics": ".odg", + "text/rtf": ".rtf", +} -class TikaDocumentParser(DocumentParser): - """ - 
This parser sends documents to a local tika server +class TikaDocumentParser: + """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx. + + Text extraction is handled by the Tika server. PDF conversion for display + is handled by Gotenberg (LibreOffice route). Because the source formats + cannot be rendered by a browser natively, ``requires_pdf_rendition`` is + True and the PDF is always produced regardless of the ``produce_archive`` + flag passed to ``parse``. + + The underlying ``TikaClient`` HTTP connection is opened once in + ``__enter__`` via an ``ExitStack`` and shared across ``parse`` and + ``extract_metadata`` calls, then closed in ``__exit__``. When the parser + is used without a context manager (e.g. the legacy view-layer metadata + path), ``extract_metadata`` falls back to creating a short-lived client + for that call only. + + Class attributes + ---------------- + name : str + Human-readable parser name. + version : str + Semantic version string, kept in sync with Paperless-ngx releases. + author : str + Maintainer name. + url : str + Issue tracker / source URL. """ - logging_name = "paperless.parsing.tika" + name: str = "Paperless-ngx Tika Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" - def get_thumbnail(self, document_path, mime_type, file_name=None): - if not self.archive_path: - self.archive_path = self.convert_to_pdf(document_path, file_name) + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ - return make_thumbnail_from_pdf( - self.archive_path, - self.tempdir, - self.logging_group, + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + """Return the MIME types this parser handles. + + Returns + ------- + dict[str, str] + Mapping of MIME type to preferred file extension. 
+ """ + return _SUPPORTED_MIME_TYPES + + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + """Return the priority score for handling this file. + + Returns ``None`` when Tika integration is disabled so the registry + skips this parser entirely. + + Parameters + ---------- + mime_type: + Detected MIME type of the file. + filename: + Original filename including extension. + path: + Optional filesystem path. Not inspected by this parser. + + Returns + ------- + int | None + 10 if TIKA_ENABLED and the MIME type is supported, otherwise None. + """ + if not settings.TIKA_ENABLED: + return None + if mime_type in _SUPPORTED_MIME_TYPES: + return 10 + return None + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def can_produce_archive(self) -> bool: + """Whether this parser can produce a searchable PDF archive copy. + + Returns + ------- + bool + Always False — Tika produces a display PDF, not an OCR archive. + """ + return False + + @property + def requires_pdf_rendition(self) -> bool: + """Whether the parser must produce a PDF for the frontend to display. + + Returns + ------- + bool + Always True — Office formats cannot be rendered natively in a + browser, so a PDF conversion is always required for display. 
+ """ + return True + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self._tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), ) + self._text: str | None = None + self._date: datetime.datetime | None = None + self._archive_path: Path | None = None + self._exit_stack = ExitStack() + self._tika_client: TikaClient | None = None - def extract_metadata(self, document_path, mime_type): - try: - with TikaClient( + def __enter__(self) -> Self: + self._tika_client = self._exit_stack.enter_context( + TikaClient( tika_url=settings.TIKA_ENDPOINT, timeout=settings.CELERY_TASK_TIME_LIMIT, - ) as client: - parsed = client.metadata.from_file(document_path, mime_type) - return [ - { - "namespace": "", - "prefix": "", - "key": key, - "value": parsed.data[key], - } - for key in parsed.data - ] - except Exception as e: - self.log.warning( - f"Error while fetching document metadata for {document_path}: {e}", - ) - return [] + ), + ) + return self - def parse(self, document_path: Path, mime_type: str, file_name=None) -> None: - self.log.info(f"Sending {document_path} to Tika server") + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + self._exit_stack.__exit__(exc_type, exc_val, exc_tb) + logger.debug("Cleaning up temporary directory %s", self._tempdir) + shutil.rmtree(self._tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: + """Send the document to Tika for text extraction and 
Gotenberg for PDF. + + Because ``requires_pdf_rendition`` is True the PDF conversion is + always performed — the ``produce_archive`` flag is intentionally + ignored. + + Parameters + ---------- + document_path: + Absolute path to the document file to parse. + mime_type: + Detected MIME type of the document. + produce_archive: + Accepted for protocol compatibility but ignored; the PDF rendition + is always produced since the source format cannot be displayed + natively in the browser. + + Raises + ------ + documents.parsers.ParseError + If Tika or Gotenberg returns an error. + """ + logger.info("Sending %s to Tika server", document_path) try: - with TikaClient( - tika_url=settings.TIKA_ENDPOINT, - timeout=settings.CELERY_TASK_TIME_LIMIT, - ) as client: - try: - parsed = client.tika.as_text.from_file(document_path, mime_type) - except httpx.HTTPStatusError as err: - # Workaround https://issues.apache.org/jira/browse/TIKA-4110 - # Tika fails with some files as multi-part form data - if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR: - parsed = client.tika.as_text.from_buffer( - document_path.read_bytes(), - mime_type, - ) - else: # pragma: no cover - raise + try: + parsed = self._tika_client.tika.as_text.from_file( + document_path, + mime_type, + ) + except httpx.HTTPStatusError as err: + # Workaround https://issues.apache.org/jira/browse/TIKA-4110 + # Tika fails with some files as multi-part form data + if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR: + parsed = self._tika_client.tika.as_text.from_buffer( + document_path.read_bytes(), + mime_type, + ) + else: # pragma: no cover + raise except Exception as err: raise ParseError( f"Could not parse {document_path} with tika server at " f"{settings.TIKA_ENDPOINT}: {err}", ) from err - self.text = parsed.content - if self.text is not None: - self.text = self.text.strip() + self._text = parsed.content + if self._text is not None: + self._text = self._text.strip() - self.date = parsed.created 
- if self.date is not None and timezone.is_naive(self.date): - self.date = timezone.make_aware(self.date) + self._date = parsed.created + if self._date is not None and timezone.is_naive(self._date): + self._date = timezone.make_aware(self._date) - self.archive_path = self.convert_to_pdf(document_path, file_name) + # Always convert — requires_pdf_rendition=True means the browser + # cannot display the source format natively. + self._archive_path = self._convert_to_pdf(document_path) - def convert_to_pdf(self, document_path: Path, file_name): - pdf_path = Path(self.tempdir) / "convert.pdf" + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ - self.log.info(f"Converting {document_path} to PDF as {pdf_path}") + def get_text(self) -> str | None: + """Return the plain-text content extracted during parse. + + Returns + ------- + str | None + Extracted text, or None if parse has not been called yet. + """ + return self._text + + def get_date(self) -> datetime.datetime | None: + """Return the document date detected during parse. + + Returns + ------- + datetime.datetime | None + Creation date from Tika metadata, or None if not detected. + """ + return self._date + + def get_archive_path(self) -> Path | None: + """Return the path to the generated PDF rendition, or None. + + Returns + ------- + Path | None + Path to the PDF produced by Gotenberg, or None if parse has not + been called yet. + """ + return self._archive_path + + # ------------------------------------------------------------------ + # Thumbnail and metadata + # ------------------------------------------------------------------ + + def get_thumbnail(self, document_path: Path, mime_type: str) -> Path: + """Generate a thumbnail from the PDF rendition of the document. + + Converts the document to PDF first if not already done. 
+ + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + + Returns + ------- + Path + Path to the generated WebP thumbnail inside the temporary directory. + """ + if self._archive_path is None: + self._archive_path = self._convert_to_pdf(document_path) + return make_thumbnail_from_pdf(self._archive_path, self._tempdir) + + def get_page_count( + self, + document_path: Path, + mime_type: str, + ) -> int | None: + """Return the number of pages in the document. + + Returns + ------- + int | None + Always None — page count is not available from Tika. + """ + return None + + def extract_metadata( + self, + document_path: Path, + mime_type: str, + ) -> list[MetadataEntry]: + """Extract format-specific metadata via the Tika metadata endpoint. + + When the parser is used as a context manager, the shared + ``TikaClient`` opened in ``__enter__`` is reused. When called + outside a context manager (e.g. the legacy view-layer metadata path), + a short-lived ``TikaClient`` is created for this call only. + + Returns + ------- + list[MetadataEntry] + All key/value pairs returned by Tika, or ``[]`` on error. 
+ """ + try: + if self._tika_client is not None: + parsed = self._tika_client.metadata.from_file(document_path, mime_type) + else: + with TikaClient( + tika_url=settings.TIKA_ENDPOINT, + timeout=settings.CELERY_TASK_TIME_LIMIT, + ) as client: + parsed = client.metadata.from_file(document_path, mime_type) + return [ + { + "namespace": "", + "prefix": "", + "key": key, + "value": parsed.data[key], + } + for key in parsed.data + ] + except Exception as e: + logger.warning( + "Error while fetching document metadata for %s: %s", + document_path, + e, + ) + return [] + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _convert_to_pdf(self, document_path: Path) -> Path: + """Convert the document to PDF using Gotenberg's LibreOffice route. + + Parameters + ---------- + document_path: + Absolute path to the source document. + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. + + Raises + ------ + documents.parsers.ParseError + If Gotenberg returns an error. + """ + pdf_path = self._tempdir / "convert.pdf" + + logger.info("Converting %s to PDF as %s", document_path, pdf_path) with ( GotenbergClient( @@ -101,36 +414,30 @@ class TikaDocumentParser(DocumentParser): ) as client, client.libre_office.to_pdf() as route, ): - # Set the output format of the resulting PDF - if settings.OCR_OUTPUT_TYPE in { + # Set the output format of the resulting PDF. + # OutputTypeConfig reads the database-stored ApplicationConfiguration + # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var. 
+ output_type = OutputTypeConfig().output_type + if output_type in { OutputTypeChoices.PDF_A, OutputTypeChoices.PDF_A2, }: route.pdf_format(PdfAFormat.A2b) - elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: - self.log.warning( + elif output_type == OutputTypeChoices.PDF_A1: + logger.warning( "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", ) route.pdf_format(PdfAFormat.A2b) - elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: + elif output_type == OutputTypeChoices.PDF_A3: route.pdf_format(PdfAFormat.A3b) route.convert(document_path) try: response = route.run() - pdf_path.write_bytes(response.content) - return pdf_path - except Exception as err: raise ParseError( f"Error while converting document to PDF: {err}", ) from err - - def get_settings(self) -> OutputTypeConfig: - """ - This parser only uses the PDF output type configuration currently - """ - return OutputTypeConfig() diff --git a/src/paperless/tests/parsers/test_tika_parser.py b/src/paperless/tests/parsers/test_tika_parser.py index 2cf39da59..d18d178e6 100644 --- a/src/paperless/tests/parsers/test_tika_parser.py +++ b/src/paperless/tests/parsers/test_tika_parser.py @@ -5,11 +5,60 @@ from pathlib import Path import pytest from httpx import codes -from paperless_tika.parsers import TikaDocumentParser from pytest_django.fixtures import SettingsWrapper from pytest_httpx import HTTPXMock from documents.parsers import ParseError +from paperless.parsers import ParserProtocol +from paperless.parsers.tika import TikaDocumentParser + + +class TestTikaParserRegistryInterface: + """Verify that TikaDocumentParser satisfies the ParserProtocol contract.""" + + def test_satisfies_parser_protocol(self) -> None: + assert isinstance(TikaDocumentParser(), ParserProtocol) + + def test_supported_mime_types_is_classmethod(self) -> None: + mime_types = TikaDocumentParser.supported_mime_types() + assert isinstance(mime_types, dict) + assert len(mime_types) > 0 + + def 
test_score_returns_none_when_tika_disabled( + self, + settings: SettingsWrapper, + ) -> None: + settings.TIKA_ENABLED = False + result = TikaDocumentParser.score( + "application/vnd.oasis.opendocument.text", + "sample.odt", + ) + assert result is None + + def test_score_returns_int_when_tika_enabled( + self, + settings: SettingsWrapper, + ) -> None: + settings.TIKA_ENABLED = True + result = TikaDocumentParser.score( + "application/vnd.oasis.opendocument.text", + "sample.odt", + ) + assert isinstance(result, int) + + def test_score_returns_none_for_unsupported_mime( + self, + settings: SettingsWrapper, + ) -> None: + settings.TIKA_ENABLED = True + result = TikaDocumentParser.score("application/pdf", "doc.pdf") + assert result is None + + def test_can_produce_archive_is_false(self) -> None: + assert TikaDocumentParser().can_produce_archive is False + + def test_requires_pdf_rendition_is_true(self) -> None: + assert TikaDocumentParser().requires_pdf_rendition is True @pytest.mark.django_db() @@ -36,12 +85,12 @@ class TestTikaParser: tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text") - assert tika_parser.text == "the content" - assert tika_parser.archive_path is not None - with Path(tika_parser.archive_path).open("rb") as f: + assert tika_parser.get_text() == "the content" + assert tika_parser.get_archive_path() is not None + with Path(tika_parser.get_archive_path()).open("rb") as f: assert f.read() == b"PDF document" - assert tika_parser.date == datetime.datetime( + assert tika_parser.get_date() == datetime.datetime( 2020, 11, 21, @@ -89,7 +138,7 @@ class TestTikaParser: httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) with pytest.raises(ParseError): - tika_parser.convert_to_pdf(sample_odt_file, None) + tika_parser._convert_to_pdf(sample_odt_file) @pytest.mark.parametrize( ("setting_value", "expected_form_value"), @@ -106,7 +155,6 @@ class TestTikaParser: expected_form_value: str, httpx_mock: HTTPXMock, settings: 
SettingsWrapper, - tika_parser: TikaDocumentParser, sample_odt_file: Path, ) -> None: """ @@ -117,6 +165,8 @@ class TestTikaParser: THEN: - Request to Gotenberg contains the expected PDF/A format string """ + # Change the setting before converting: OutputTypeConfig is built inside + # _convert_to_pdf(), so the value is read at call time, not at __init__. settings.OCR_OUTPUT_TYPE = setting_value httpx_mock.add_response( status_code=codes.OK, @@ -124,7 +174,8 @@ class TestTikaParser: method="POST", ) - tika_parser.convert_to_pdf(sample_odt_file, None) + with TikaDocumentParser() as parser: + parser._convert_to_pdf(sample_odt_file) request = httpx_mock.get_request() diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py index 2b7495e35..412b80bb7 100644 --- a/src/paperless_tika/signals.py +++ b/src/paperless_tika/signals.py @@ -1,7 +1,13 @@ def get_parser(*args, **kwargs): - from paperless_tika.parsers import TikaDocumentParser + from paperless.parsers.tika import TikaDocumentParser - return TikaDocumentParser(*args, **kwargs) + # The new TikaDocumentParser does not use the legacy logging_group / + # progress_callback kwargs injected by the old signal-based consumer. + # These are dropped here; Phase 4 will replace this signal path with the + # new ParserRegistry so the shim can be removed at that point. + kwargs.pop("logging_group", None) + kwargs.pop("progress_callback", None) + return TikaDocumentParser() def tika_consumer_declaration(sender, **kwargs):