Feat(parsers): call configure(ParserContext()) in update_document task

Apply the same new-style parser shim pattern as the consumer to
update_document_content_maybe_archive_file:

- Call __enter__ for Text/Tika parsers after instantiation
- Call configure(ParserContext()) before parse() for all new-style parsers
  (mailrule_id is not available here — this is a re-process of an
  existing document, so the default empty context is correct)
- Call parse(path, mime_type) with 2 args for new-style parsers
- Call get_thumbnail(path, mime_type) with 2 args for new-style parsers
- Call __exit__ instead of cleanup() in the finally block

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-19 08:28:17 -07:00
parent a36b6ecbef
commit 8148f2ced2

View File

@@ -65,6 +65,10 @@ from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless.parsers import ParserContext
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_ai.indexing import llm_index_add_or_update_document
from paperless_ai.indexing import llm_index_remove_document
from paperless_ai.indexing import update_llm_index
@@ -315,14 +319,37 @@ def update_document_content_maybe_archive_file(document_id) -> None:
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
parser.__enter__()
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
try:
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if isinstance(
parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
parser.configure(ParserContext())
parser.parse(document.source_path, mime_type)
else:
parser.parse(
document.source_path,
mime_type,
document.get_public_filename(),
)
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if isinstance(
parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
else:
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
@@ -403,8 +430,14 @@ def update_document_content_maybe_archive_file(document_id) -> None:
f"Error while parsing document {document} (ID: {document_id})",
)
finally:
# TODO(stumpylog): Cleanup once all parsers are handled
parser.cleanup()
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if isinstance(
parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
parser.__exit__(None, None, None)
else:
parser.cleanup()
@shared_task