mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-19 15:35:57 +00:00
Feat(parsers): call configure(ParserContext()) in update_document task
Apply the same new-style parser shim pattern as the consumer to update_document_content_maybe_archive_file: - Call __enter__ for Text/Tika parsers after instantiation - Call configure(ParserContext()) before parse() for all new-style parsers (mailrule_id is not available here — this is a re-process of an existing document, so the default empty context is correct) - Call parse(path, mime_type) with 2 args for new-style parsers - Call get_thumbnail(path, mime_type) with 2 args for new-style parsers - Call __exit__ instead of cleanup() in the finally block Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -65,6 +65,10 @@ from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
@@ -315,14 +319,37 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type, document.get_public_filename())
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
parser.__enter__()
|
||||
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
|
||||
):
|
||||
parser.configure(ParserContext())
|
||||
parser.parse(document.source_path, mime_type)
|
||||
else:
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
|
||||
):
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
else:
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
@@ -403,8 +430,14 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
parser.cleanup()
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
@shared_task
|
||||
|
||||
Reference in New Issue
Block a user