From 89d00247f6bfe73c2672325b56d0538e0fdec71c Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 12 Mar 2026 19:29:37 -0700 Subject: [PATCH] Fix: require context manager for TikaDocumentParser; clean up client lifecycle - consumer.py: call __enter__ for new-style parsers so _tika_client and _gotenberg_client are set before parse() is invoked - views.py: use `with parser` (via nullcontext for old-style parsers) in get_metadata so extract_metadata always runs inside a context manager - tika.py: GotenbergClient added to ExitStack alongside TikaClient; inline client creation removed from extract_metadata and _convert_to_pdf; __exit__ uses ExitStack.close() instead of __exit__ pass-through Co-Authored-By: Claude Sonnet 4.6 --- src/documents/consumer.py | 6 +++++ src/documents/views.py | 6 ++++- src/paperless/parsers/tika.py | 48 +++++++++++++++-------------------- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fadd9a4e6..53fd1ca0f 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -449,6 +449,12 @@ class ConsumerPlugin( progress_callback=progress_callback, ) + # New-style parsers use __enter__/__exit__ for resource management. + # _parser_cleanup (below) handles __exit__; call __enter__ here. + # TODO(stumpylog): Remove me in the future + if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): + document_parser.__enter__() + self.log.debug(f"Parser: {type(document_parser).__name__}") # Parse the document. This may take some time. diff --git a/src/documents/views.py b/src/documents/views.py index a69293ee9..9bbe51892 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -7,6 +7,7 @@ import tempfile import zipfile from collections import defaultdict from collections import deque +from contextlib import nullcontext from datetime import datetime from pathlib import Path from time import mktime @@ -225,6 +226,7 @@ from paperless.celery import app as celery_app from paperless.config import AIConfig from paperless.config import GeneralConfig from paperless.models import ApplicationConfiguration +from paperless.parsers import ParserProtocol from paperless.serialisers import GroupSerializer from paperless.serialisers import UserSerializer from paperless.views import StandardPagination @@ -1084,9 +1086,11 @@ class DocumentViewSet( parser_class = get_parser_class_for_mime_type(mime_type) if parser_class: parser = parser_class(progress_callback=None, logging_group=None) + cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser) try: - return parser.extract_metadata(file, mime_type) + with cm: + return parser.extract_metadata(file, mime_type) except Exception: # pragma: no cover logger.exception(f"Issue getting metadata for {file}") # TODO: cover GPG errors, remove later. diff --git a/src/paperless/parsers/tika.py b/src/paperless/parsers/tika.py index 2e73b85c9..b9307858a 100644 --- a/src/paperless/parsers/tika.py +++ b/src/paperless/parsers/tika.py @@ -63,12 +63,11 @@ class TikaDocumentParser: True and the PDF is always produced regardless of the ``produce_archive`` flag passed to ``parse``. - The underlying ``TikaClient`` HTTP connection is opened once in - ``__enter__`` via an ``ExitStack`` and shared across ``parse`` and - ``extract_metadata`` calls, then closed in ``__exit__``. When the parser - is used without a context manager (e.g. the legacy view-layer metadata - path), ``extract_metadata`` falls back to creating a short-lived client - for that call only. + Both ``TikaClient`` and ``GotenbergClient`` are opened once in + ``__enter__`` via an ``ExitStack`` and shared across ``parse``, + ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via + ``ExitStack.close()`` in ``__exit__``. The parser must always be used + as a context manager. Class attributes ---------------- @@ -175,6 +174,7 @@ class TikaDocumentParser: self._archive_path: Path | None = None self._exit_stack = ExitStack() self._tika_client: TikaClient | None = None + self._gotenberg_client: GotenbergClient | None = None def __enter__(self) -> Self: self._tika_client = self._exit_stack.enter_context( @@ -183,6 +183,12 @@ class TikaDocumentParser: timeout=settings.CELERY_TASK_TIME_LIMIT, ), ) + self._gotenberg_client = self._exit_stack.enter_context( + GotenbergClient( + host=settings.TIKA_GOTENBERG_ENDPOINT, + timeout=settings.CELERY_TASK_TIME_LIMIT, + ), + ) return self def __exit__( @@ -191,7 +197,7 @@ class TikaDocumentParser: exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: - self._exit_stack.__exit__(exc_type, exc_val, exc_tb) + self._exit_stack.close() logger.debug("Cleaning up temporary directory %s", self._tempdir) shutil.rmtree(self._tempdir, ignore_errors=True) @@ -348,25 +354,16 @@ class TikaDocumentParser: ) -> list[MetadataEntry]: """Extract format-specific metadata via the Tika metadata endpoint. - When the parser is used as a context manager, the shared - ``TikaClient`` opened in ``__enter__`` is reused. When called - outside a context manager (e.g. the legacy view-layer metadata path), - a short-lived ``TikaClient`` is created for this call only. - Returns ------- list[MetadataEntry] All key/value pairs returned by Tika, or ``[]`` on error. """ + if TYPE_CHECKING: + assert self._tika_client is not None + try: - if self._tika_client is not None: - parsed = self._tika_client.metadata.from_file(document_path, mime_type) - else: - with TikaClient( - tika_url=settings.TIKA_ENDPOINT, - timeout=settings.CELERY_TASK_TIME_LIMIT, - ) as client: - parsed = client.metadata.from_file(document_path, mime_type) + parsed = self._tika_client.metadata.from_file(document_path, mime_type) return [ { "namespace": "", @@ -406,17 +403,14 @@ class TikaDocumentParser: documents.parsers.ParseError If Gotenberg returns an error. """ + if TYPE_CHECKING: + assert self._gotenberg_client is not None + pdf_path = self._tempdir / "convert.pdf" logger.info("Converting %s to PDF as %s", document_path, pdf_path) - with ( - GotenbergClient( - host=settings.TIKA_GOTENBERG_ENDPOINT, - timeout=settings.CELERY_TASK_TIME_LIMIT, - ) as client, - client.libre_office.to_pdf() as route, - ): + with self._gotenberg_client.libre_office.to_pdf() as route: # Set the output format of the resulting PDF. # OutputTypeConfig reads the database-stored ApplicationConfiguration # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.