Fix: require context manager for TikaDocumentParser; clean up client lifecycle

- consumer.py: call __enter__ for new-style parsers so _tika_client and _gotenberg_client are set before parse() is invoked
- views.py: use `with parser` (via nullcontext for old-style parsers) in get_metadata so extract_metadata always runs inside a context manager
- tika.py: GotenbergClient added to ExitStack alongside TikaClient; inline client creation removed from extract_metadata and _convert_to_pdf; __exit__ uses ExitStack.close() instead of __exit__ pass-through

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-21 12:54:22 +00:00 · 2026-03-12 19:29:37 -07:00
parent c16bcb7fef
commit 89d00247f6
3 changed files with 32 additions and 28 deletions
@@ -449,6 +449,12 @@ class ConsumerPlugin(
            progress_callback=progress_callback,
        )

+        # New-style parsers use __enter__/__exit__ for resource management.
+        # _parser_cleanup (below) handles __exit__; call __enter__ here.
+        # TODO(stumpylog): Remove me in the future
+        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+            document_parser.__enter__()
+
        self.log.debug(f"Parser: {type(document_parser).__name__}")

        # Parse the document. This may take some time.
@@ -7,6 +7,7 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
+from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -225,6 +226,7 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
+from paperless.parsers import ParserProtocol
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1084,9 +1086,11 @@ class DocumentViewSet(
        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)
+            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)

            try:
-                return parser.extract_metadata(file, mime_type)
+                with cm:
+                    return parser.extract_metadata(file, mime_type)
            except Exception:  # pragma: no cover
                logger.exception(f"Issue getting metadata for {file}")
                # TODO: cover GPG errors, remove later.
@@ -63,12 +63,11 @@ class TikaDocumentParser:
    True and the PDF is always produced regardless of the ``produce_archive``
    flag passed to ``parse``.

-    The underlying ``TikaClient`` HTTP connection is opened once in
-    ``__enter__`` via an ``ExitStack`` and shared across ``parse`` and
-    ``extract_metadata`` calls, then closed in ``__exit__``.  When the parser
-    is used without a context manager (e.g. the legacy view-layer metadata
-    path), ``extract_metadata`` falls back to creating a short-lived client
-    for that call only.
+    Both ``TikaClient`` and ``GotenbergClient`` are opened once in
+    ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
+    ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
+    ``ExitStack.close()`` in ``__exit__``.  The parser must always be used
+    as a context manager.

    Class attributes
    ----------------
@@ -175,6 +174,7 @@ class TikaDocumentParser:
        self._archive_path: Path | None = None
        self._exit_stack = ExitStack()
        self._tika_client: TikaClient | None = None
+        self._gotenberg_client: GotenbergClient | None = None

    def __enter__(self) -> Self:
        self._tika_client = self._exit_stack.enter_context(
@@ -183,6 +183,12 @@ class TikaDocumentParser:
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ),
        )
+        self._gotenberg_client = self._exit_stack.enter_context(
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ),
+        )
        return self

    def __exit__(
@@ -191,7 +197,7 @@ class TikaDocumentParser:
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
-        self._exit_stack.__exit__(exc_type, exc_val, exc_tb)
+        self._exit_stack.close()
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        shutil.rmtree(self._tempdir, ignore_errors=True)

@@ -348,25 +354,16 @@ class TikaDocumentParser:
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata via the Tika metadata endpoint.

-        When the parser is used as a context manager, the shared
-        ``TikaClient`` opened in ``__enter__`` is reused.  When called
-        outside a context manager (e.g. the legacy view-layer metadata path),
-        a short-lived ``TikaClient`` is created for this call only.
-
        Returns
        -------
        list[MetadataEntry]
            All key/value pairs returned by Tika, or ``[]`` on error.
        """
+        if TYPE_CHECKING:
+            assert self._tika_client is not None
+
        try:
-            if self._tika_client is not None:
-                parsed = self._tika_client.metadata.from_file(document_path, mime_type)
-            else:
-                with TikaClient(
-                    tika_url=settings.TIKA_ENDPOINT,
-                    timeout=settings.CELERY_TASK_TIME_LIMIT,
-                ) as client:
-                    parsed = client.metadata.from_file(document_path, mime_type)
+            parsed = self._tika_client.metadata.from_file(document_path, mime_type)
            return [
                {
                    "namespace": "",
@@ -406,17 +403,14 @@ class TikaDocumentParser:
        documents.parsers.ParseError
            If Gotenberg returns an error.
        """
+        if TYPE_CHECKING:
+            assert self._gotenberg_client is not None
+
        pdf_path = self._tempdir / "convert.pdf"

        logger.info("Converting %s to PDF as %s", document_path, pdf_path)

-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.libre_office.to_pdf() as route,
-        ):
+        with self._gotenberg_client.libre_office.to_pdf() as route:
            # Set the output format of the resulting PDF.
            # OutputTypeConfig reads the database-stored ApplicationConfiguration
            # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.