From 89d00247f6bfe73c2672325b56d0538e0fdec71c Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 12 Mar 2026 19:29:37 -0700
Subject: [PATCH] Fix: require context manager for TikaDocumentParser; clean up
 client lifecycle

- consumer.py: call __enter__ for new-style parsers so _tika_client and
  _gotenberg_client are set before parse() is invoked
- views.py: in get_metadata, enter new-style parsers via `with` and wrap
  old-style parsers in nullcontext, so extract_metadata always runs inside
  a context manager
- tika.py: GotenbergClient added to ExitStack alongside TikaClient;
  inline client creation removed from extract_metadata and _convert_to_pdf;
  __exit__ uses ExitStack.close() instead of __exit__ pass-through

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/documents/consumer.py     |  6 +++++
 src/documents/views.py        |  6 ++++-
 src/paperless/parsers/tika.py | 48 +++++++++++++++--------------------
 3 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index fadd9a4e6..53fd1ca0f 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -449,6 +449,12 @@ class ConsumerPlugin(
             progress_callback=progress_callback,
         )
 
+        # New-style parsers use __enter__/__exit__ for resource management.
+        # _parser_cleanup (below) handles __exit__; call __enter__ here.
+        # TODO(stumpylog): Remove this manual __enter__ call in the future
+        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+            document_parser.__enter__()
+
         self.log.debug(f"Parser: {type(document_parser).__name__}")
 
         # Parse the document. This may take some time.
diff --git a/src/documents/views.py b/src/documents/views.py
index a69293ee9..9bbe51892 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -7,6 +7,7 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
+from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -225,6 +226,7 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
+from paperless.parsers import ParserProtocol
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1084,9 +1086,11 @@ class DocumentViewSet(
         parser_class = get_parser_class_for_mime_type(mime_type)
         if parser_class:
             parser = parser_class(progress_callback=None, logging_group=None)
+            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
 
             try:
-                return parser.extract_metadata(file, mime_type)
+                with cm:
+                    return parser.extract_metadata(file, mime_type)
             except Exception:  # pragma: no cover
                 logger.exception(f"Issue getting metadata for {file}")
                 # TODO: cover GPG errors, remove later.
diff --git a/src/paperless/parsers/tika.py b/src/paperless/parsers/tika.py
index 2e73b85c9..b9307858a 100644
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -63,12 +63,11 @@ class TikaDocumentParser:
     True and the PDF is always produced regardless of the ``produce_archive``
     flag passed to ``parse``.
 
-    The underlying ``TikaClient`` HTTP connection is opened once in
-    ``__enter__`` via an ``ExitStack`` and shared across ``parse`` and
-    ``extract_metadata`` calls, then closed in ``__exit__``.  When the parser
-    is used without a context manager (e.g. the legacy view-layer metadata
-    path), ``extract_metadata`` falls back to creating a short-lived client
-    for that call only.
+    Both ``TikaClient`` and ``GotenbergClient`` are opened once in
+    ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
+    ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
+    ``ExitStack.close()`` in ``__exit__``.  The parser must always be used
+    as a context manager.
 
     Class attributes
     ----------------
@@ -175,6 +174,7 @@ class TikaDocumentParser:
         self._archive_path: Path | None = None
         self._exit_stack = ExitStack()
         self._tika_client: TikaClient | None = None
+        self._gotenberg_client: GotenbergClient | None = None
 
     def __enter__(self) -> Self:
         self._tika_client = self._exit_stack.enter_context(
@@ -183,6 +183,12 @@ class TikaDocumentParser:
                 timeout=settings.CELERY_TASK_TIME_LIMIT,
             ),
         )
+        self._gotenberg_client = self._exit_stack.enter_context(
+            GotenbergClient(
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
+            ),
+        )
         return self
 
     def __exit__(
@@ -191,7 +197,7 @@ class TikaDocumentParser:
         exc_val: BaseException | None,
         exc_tb: TracebackType | None,
     ) -> None:
-        self._exit_stack.__exit__(exc_type, exc_val, exc_tb)
+        self._exit_stack.close()
         logger.debug("Cleaning up temporary directory %s", self._tempdir)
         shutil.rmtree(self._tempdir, ignore_errors=True)
 
@@ -348,25 +354,16 @@ class TikaDocumentParser:
     ) -> list[MetadataEntry]:
         """Extract format-specific metadata via the Tika metadata endpoint.
 
-        When the parser is used as a context manager, the shared
-        ``TikaClient`` opened in ``__enter__`` is reused.  When called
-        outside a context manager (e.g. the legacy view-layer metadata path),
-        a short-lived ``TikaClient`` is created for this call only.
-
         Returns
         -------
         list[MetadataEntry]
             All key/value pairs returned by Tika, or ``[]`` on error.
         """
+        if TYPE_CHECKING:
+            assert self._tika_client is not None
+
         try:
-            if self._tika_client is not None:
-                parsed = self._tika_client.metadata.from_file(document_path, mime_type)
-            else:
-                with TikaClient(
-                    tika_url=settings.TIKA_ENDPOINT,
-                    timeout=settings.CELERY_TASK_TIME_LIMIT,
-                ) as client:
-                    parsed = client.metadata.from_file(document_path, mime_type)
+            parsed = self._tika_client.metadata.from_file(document_path, mime_type)
             return [
                 {
                     "namespace": "",
@@ -406,17 +403,14 @@ class TikaDocumentParser:
         documents.parsers.ParseError
             If Gotenberg returns an error.
         """
+        if TYPE_CHECKING:
+            assert self._gotenberg_client is not None
+
         pdf_path = self._tempdir / "convert.pdf"
 
         logger.info("Converting %s to PDF as %s", document_path, pdf_path)
 
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.libre_office.to_pdf() as route,
-        ):
+        with self._gotenberg_client.libre_office.to_pdf() as route:
             # Set the output format of the resulting PDF.
             # OutputTypeConfig reads the database-stored ApplicationConfiguration
             # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.