From 8148f2ced2bc5ee11e202efef18d28ec2113ce31 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 19 Mar 2026 08:28:17 -0700 Subject: [PATCH] Feat(parsers): call configure(ParserContext()) in update_document task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the same new-style parser shim pattern as the consumer to update_document_content_maybe_archive_file: - Call __enter__ for Text/Tika parsers after instantiation - Call configure(ParserContext()) before parse() for all new-style parsers (mailrule_id is not available here — this is a re-process of an existing document, so the default empty context is correct) - Call parse(path, mime_type) with 2 args for new-style parsers - Call get_thumbnail(path, mime_type) with 2 args for new-style parsers - Call __exit__ instead of cleanup() in the finally block Co-Authored-By: Claude Sonnet 4.6 --- src/documents/tasks.py | 51 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 378695731..6f4e20c29 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -65,6 +65,10 @@ from documents.signals.handlers import run_workflows from documents.signals.handlers import send_websocket_document_updated from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig +from paperless.parsers import ParserContext +from paperless.parsers.mail import MailDocumentParser +from paperless.parsers.text import TextDocumentParser +from paperless.parsers.tika import TikaDocumentParser from paperless_ai.indexing import llm_index_add_or_update_document from paperless_ai.indexing import llm_index_remove_document from paperless_ai.indexing import update_llm_index @@ -315,14 +319,37 @@ def update_document_content_maybe_archive_file(document_id) -> None: parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) - try: - parser.parse(document.source_path, mime_type, document.get_public_filename()) + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance(parser, (TextDocumentParser, TikaDocumentParser)): + parser.__enter__() - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) + try: + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + parser.configure(ParserContext()) + parser.parse(document.source_path, mime_type) + else: + parser.parse( + document.source_path, + mime_type, + document.get_public_filename(), + ) + + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + thumbnail = parser.get_thumbnail(document.source_path, mime_type) + else: + thumbnail = parser.get_thumbnail( + document.source_path, + mime_type, + document.get_public_filename(), + ) with transaction.atomic(): oldDocument = Document.objects.get(pk=document.pk) @@ -403,8 +430,14 @@ def update_document_content_maybe_archive_file(document_id) -> None: f"Error while parsing document {document} (ID: {document_id})", ) finally: - # TODO(stumpylog): Cleanup once all parsers are handled - parser.cleanup() + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + parser.__exit__(None, None, None) + else: + parser.cleanup() @shared_task