Fix: require context manager for TikaDocumentParser; clean up client lifecycle

- consumer.py: call __enter__ for new-style parsers so _tika_client and
  _gotenberg_client are set before parse() is invoked
- views.py: use `with parser` (via nullcontext for old-style parsers) in
  get_metadata so extract_metadata always runs inside a context manager
- tika.py: GotenbergClient added to ExitStack alongside TikaClient;
  inline client creation removed from extract_metadata and _convert_to_pdf;
  __exit__ uses ExitStack.close() instead of __exit__ pass-through

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-12 19:29:37 -07:00
parent c16bcb7fef
commit 89d00247f6
3 changed files with 32 additions and 28 deletions
+6
View File
@@ -449,6 +449,12 @@ class ConsumerPlugin(
progress_callback=progress_callback,
)
# New-style parsers use __enter__/__exit__ for resource management.
# _parser_cleanup (below) handles __exit__; call __enter__ here.
# TODO(stumpylog): Remove me in the future
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
document_parser.__enter__()
self.log.debug(f"Parser: {type(document_parser).__name__}")
# Parse the document. This may take some time.
+5 -1
View File
@@ -7,6 +7,7 @@ import tempfile
import zipfile
from collections import defaultdict
from collections import deque
from contextlib import nullcontext
from datetime import datetime
from pathlib import Path
from time import mktime
@@ -225,6 +226,7 @@ from paperless.celery import app as celery_app
from paperless.config import AIConfig
from paperless.config import GeneralConfig
from paperless.models import ApplicationConfiguration
from paperless.parsers import ParserProtocol
from paperless.serialisers import GroupSerializer
from paperless.serialisers import UserSerializer
from paperless.views import StandardPagination
@@ -1084,9 +1086,11 @@ class DocumentViewSet(
parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
parser = parser_class(progress_callback=None, logging_group=None)
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
try:
return parser.extract_metadata(file, mime_type)
with cm:
return parser.extract_metadata(file, mime_type)
except Exception: # pragma: no cover
logger.exception(f"Issue getting metadata for {file}")
# TODO: cover GPG errors, remove later.
+21 -27
View File
@@ -63,12 +63,11 @@ class TikaDocumentParser:
True and the PDF is always produced regardless of the ``produce_archive``
flag passed to ``parse``.
The underlying ``TikaClient`` HTTP connection is opened once in
``__enter__`` via an ``ExitStack`` and shared across ``parse`` and
``extract_metadata`` calls, then closed in ``__exit__``. When the parser
is used without a context manager (e.g. the legacy view-layer metadata
path), ``extract_metadata`` falls back to creating a short-lived client
for that call only.
Both ``TikaClient`` and ``GotenbergClient`` are opened once in
``__enter__`` via an ``ExitStack`` and shared across ``parse``,
``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
``ExitStack.close()`` in ``__exit__``. The parser must always be used
as a context manager.
Class attributes
----------------
@@ -175,6 +174,7 @@ class TikaDocumentParser:
self._archive_path: Path | None = None
self._exit_stack = ExitStack()
self._tika_client: TikaClient | None = None
self._gotenberg_client: GotenbergClient | None = None
def __enter__(self) -> Self:
self._tika_client = self._exit_stack.enter_context(
@@ -183,6 +183,12 @@ class TikaDocumentParser:
timeout=settings.CELERY_TASK_TIME_LIMIT,
),
)
self._gotenberg_client = self._exit_stack.enter_context(
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
),
)
return self
def __exit__(
@@ -191,7 +197,7 @@ class TikaDocumentParser:
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
self._exit_stack.__exit__(exc_type, exc_val, exc_tb)
self._exit_stack.close()
logger.debug("Cleaning up temporary directory %s", self._tempdir)
shutil.rmtree(self._tempdir, ignore_errors=True)
@@ -348,25 +354,16 @@ class TikaDocumentParser:
) -> list[MetadataEntry]:
"""Extract format-specific metadata via the Tika metadata endpoint.
When the parser is used as a context manager, the shared
``TikaClient`` opened in ``__enter__`` is reused. When called
outside a context manager (e.g. the legacy view-layer metadata path),
a short-lived ``TikaClient`` is created for this call only.
Returns
-------
list[MetadataEntry]
All key/value pairs returned by Tika, or ``[]`` on error.
"""
if TYPE_CHECKING:
assert self._tika_client is not None
try:
if self._tika_client is not None:
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
else:
with TikaClient(
tika_url=settings.TIKA_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client:
parsed = client.metadata.from_file(document_path, mime_type)
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
return [
{
"namespace": "",
@@ -406,17 +403,14 @@ class TikaDocumentParser:
documents.parsers.ParseError
If Gotenberg returns an error.
"""
if TYPE_CHECKING:
assert self._gotenberg_client is not None
pdf_path = self._tempdir / "convert.pdf"
logger.info("Converting %s to PDF as %s", document_path, pdf_path)
with (
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client,
client.libre_office.to_pdf() as route,
):
with self._gotenberg_client.libre_office.to_pdf() as route:
# Set the output format of the resulting PDF.
# OutputTypeConfig reads the database-stored ApplicationConfiguration
# first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.