diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 378695731..6f4e20c29 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -65,6 +65,10 @@ from documents.signals.handlers import run_workflows from documents.signals.handlers import send_websocket_document_updated from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig +from paperless.parsers import ParserContext +from paperless.parsers.mail import MailDocumentParser +from paperless.parsers.text import TextDocumentParser +from paperless.parsers.tika import TikaDocumentParser from paperless_ai.indexing import llm_index_add_or_update_document from paperless_ai.indexing import llm_index_remove_document from paperless_ai.indexing import update_llm_index @@ -315,14 +319,37 @@ def update_document_content_maybe_archive_file(document_id) -> None: parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) - try: - parser.parse(document.source_path, mime_type, document.get_public_filename()) + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance(parser, (TextDocumentParser, TikaDocumentParser)): + parser.__enter__() - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) + try: + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + parser.configure(ParserContext()) + parser.parse(document.source_path, mime_type) + else: + parser.parse( + document.source_path, + mime_type, + document.get_public_filename(), + ) + + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + thumbnail = parser.get_thumbnail(document.source_path, mime_type) + else: + thumbnail = parser.get_thumbnail( + document.source_path, + mime_type, + document.get_public_filename(), + ) with transaction.atomic(): oldDocument = Document.objects.get(pk=document.pk) @@ -403,8 +430,14 @@ def update_document_content_maybe_archive_file(document_id) -> None: f"Error while parsing document {document} (ID: {document_id})", ) finally: - # TODO(stumpylog): Cleanup once all parsers are handled - parser.cleanup() + # TODO(stumpylog): Remove branch in the future when all parsers use new protocol + if isinstance( + parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + parser.__exit__(None, None, None) + else: + parser.cleanup() @shared_task