Feat(parsers): call configure(ParserContext()) in update_document task

Apply the same new-style parser shim pattern as the consumer to update_document_content_maybe_archive_file: - Call __enter__ for Text/Tika parsers after instantiation - Call configure(ParserContext()) before parse() for all new-style parsers (mailrule_id is not available here — this is a re-process of an existing document, so the default empty context is correct) - Call parse(path, mime_type) with 2 args for new-style parsers - Call get_thumbnail(path, mime_type) with 2 args for new-style parsers - Call __exit__ instead of cleanup() in the finally block Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 05:35:24 +00:00 · 2026-03-19 08:28:17 -07:00
parent a36b6ecbef
commit 8148f2ced2
1 changed files with 42 additions and 9 deletions
@@ -65,6 +65,10 @@ from documents.signals.handlers import run_workflows
 from documents.signals.handlers import send_websocket_document_updated
 from documents.workflows.utils import get_workflows_for_trigger
 from paperless.config import AIConfig
+from paperless.parsers import ParserContext
+from paperless.parsers.mail import MailDocumentParser
+from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
 from paperless_ai.indexing import llm_index_add_or_update_document
 from paperless_ai.indexing import llm_index_remove_document
 from paperless_ai.indexing import update_llm_index
@@ -315,14 +319,37 @@ def update_document_content_maybe_archive_file(document_id) -> None:

    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())

-    try:
-        parser.parse(document.source_path, mime_type, document.get_public_filename())
+    # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
+        parser.__enter__()

-        thumbnail = parser.get_thumbnail(
-            document.source_path,
-            mime_type,
-            document.get_public_filename(),
-        )
+    try:
+        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+        if isinstance(
+            parser,
+            (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
+        ):
+            parser.configure(ParserContext())
+            parser.parse(document.source_path, mime_type)
+        else:
+            parser.parse(
+                document.source_path,
+                mime_type,
+                document.get_public_filename(),
+            )
+
+        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+        if isinstance(
+            parser,
+            (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
+        ):
+            thumbnail = parser.get_thumbnail(document.source_path, mime_type)
+        else:
+            thumbnail = parser.get_thumbnail(
+                document.source_path,
+                mime_type,
+                document.get_public_filename(),
+            )

        with transaction.atomic():
            oldDocument = Document.objects.get(pk=document.pk)
@@ -403,8 +430,14 @@ def update_document_content_maybe_archive_file(document_id) -> None:
            f"Error while parsing document {document} (ID: {document_id})",
        )
    finally:
-        # TODO(stumpylog): Cleanup once all parsers are handled
-        parser.cleanup()
+        # TODO(stumpylog): Remove branch in the future when all parsers use new protocol
+        if isinstance(
+            parser,
+            (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
+        ):
+            parser.__exit__(None, None, None)
+        else:
+            parser.cleanup()


@shared_task