diff --git a/src/documents/checks.py b/src/documents/checks.py index b6e9e90fc..0867ef403 100644 --- a/src/documents/checks.py +++ b/src/documents/checks.py @@ -3,25 +3,20 @@ from django.core.checks import Error from django.core.checks import Warning from django.core.checks import register -from documents.signals import document_consumer_declaration from documents.templating.utils import convert_format_str_to_template_format +from paperless.parsers.registry import get_parser_registry @register() def parser_check(app_configs, **kwargs): - parsers = [] - for response in document_consumer_declaration.send(None): - parsers.append(response[1]) - - if len(parsers) == 0: + if not get_parser_registry().all_parsers(): return [ Error( "No parsers found. This is a bug. The consumer won't be " "able to consume any documents without parsers.", ), ] - else: - return [] + return [] @register() diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 81d9eb456..8cbbb9d44 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -32,7 +32,6 @@ from documents.models import DocumentType from documents.models import StoragePath from documents.models import Tag from documents.models import WorkflowTrigger -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import get_parser_class_for_mime_type from documents.permissions import set_permissions_for_object @@ -52,40 +51,11 @@ from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess from paperless.parsers import ParserContext -from paperless.parsers.mail import MailDocumentParser -from paperless.parsers.remote import RemoteDocumentParser -from paperless.parsers.tesseract import RasterisedDocumentParser -from paperless.parsers.text import TextDocumentParser -from paperless.parsers.tika import TikaDocumentParser +from paperless.parsers import ParserProtocol LOGGING_NAME: Final[str] = "paperless.consumer" -def _parser_cleanup(parser: DocumentParser) -> None: - """ - Call cleanup on a parser, handling the new-style context-manager parsers. - - New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown - instead of a cleanup() method. This shim will be removed once all existing parsers - have switched to the new style and this consumer is updated to use it - - TODO(stumpylog): Remove me in the future - """ - if isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ): - parser.__exit__(None, None, None) - else: - parser.cleanup() - - class WorkflowTriggerPlugin( NoCleanupPluginMixin, NoSetupPluginMixin, @@ -422,7 +392,7 @@ class ConsumerPlugin( self.log.error(f"Error attempting to clean PDF: {e}") # Based on the mime type, get the parser for that type - parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type( + parser_class: type[ParserProtocol] | None = get_parser_class_for_mime_type( mime_type, ) if not parser_class: @@ -446,313 +416,275 @@ class ConsumerPlugin( tempdir.cleanup() raise - def progress_callback( - current_progress, - max_progress, - ) -> None: # pragma: no cover - # recalculate progress to be within 20 and 80 - p = int((current_progress / max_progress) * 50 + 20) - self._send_progress(p, 100, ProgressStatusOptions.WORKING) - # This doesn't parse the document yet, but gives us a parser. - - document_parser: DocumentParser = parser_class( - self.logging_group, - progress_callback=progress_callback, - ) - - parser_is_new_style = isinstance( - document_parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ) - - # New-style parsers use __enter__/__exit__ for resource management. - # _parser_cleanup (below) handles __exit__; call __enter__ here. - # TODO(stumpylog): Remove me in the future - if parser_is_new_style: - document_parser.__enter__() - - self.log.debug(f"Parser: {type(document_parser).__name__}") - - # Parse the document. This may take some time. - - text = None - date = None - thumbnail = None - archive_path = None - page_count = None - - try: - self._send_progress( - 20, - 100, - ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.PARSING_DOCUMENT, + with parser_class() as document_parser: + document_parser.configure( + ParserContext(mailrule_id=self.input_doc.mailrule_id), ) - self.log.debug(f"Parsing {self.filename}...") - # TODO(stumpylog): Remove me in the future when all parsers use new protocol - if parser_is_new_style: - document_parser.configure( - ParserContext(mailrule_id=self.input_doc.mailrule_id), - ) - # TODO(stumpylog): Remove me in the future - document_parser.parse(self.working_copy, mime_type) - else: - document_parser.parse(self.working_copy, mime_type, self.filename) + self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}") - self.log.debug(f"Generating thumbnail for {self.filename}...") - self._send_progress( - 70, - 100, - ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.GENERATING_THUMBNAIL, - ) - # TODO(stumpylog): Remove me in the future when all parsers use new protocol - if parser_is_new_style: - thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) - else: - thumbnail = document_parser.get_thumbnail( - self.working_copy, - mime_type, - self.filename, - ) + # Parse the document. This may take some time. - text = document_parser.get_text() - date = document_parser.get_date() - if date is None: + text = None + date = None + thumbnail = None + archive_path = None + page_count = None + + try: self._send_progress( - 90, + 20, 100, ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.PARSE_DATE, + ConsumerStatusShortMessage.PARSING_DOCUMENT, ) - with get_date_parser() as date_parser: - date = next(date_parser.parse(self.filename, text), None) - archive_path = document_parser.get_archive_path() - page_count = document_parser.get_page_count(self.working_copy, mime_type) + self.log.debug(f"Parsing {self.filename}...") - except ParseError as e: - _parser_cleanup(document_parser) - if tempdir: - tempdir.cleanup() - self._fail( - str(e), - f"Error occurred while consuming document {self.filename}: {e}", - exc_info=True, - exception=e, - ) - except Exception as e: - _parser_cleanup(document_parser) - if tempdir: - tempdir.cleanup() - self._fail( - str(e), - f"Unexpected error while consuming document {self.filename}: {e}", - exc_info=True, - exception=e, - ) + document_parser.parse(self.working_copy, mime_type) - # Prepare the document classifier. + self.log.debug(f"Generating thumbnail for {self.filename}...") + self._send_progress( + 70, + 100, + ProgressStatusOptions.WORKING, + ConsumerStatusShortMessage.GENERATING_THUMBNAIL, + ) + thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) - # TODO: I don't really like to do this here, but this way we avoid - # reloading the classifier multiple times, since there are multiple - # post-consume hooks that all require the classifier. - - classifier = load_classifier() - - self._send_progress( - 95, - 100, - ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.SAVE_DOCUMENT, - ) - # now that everything is done, we can start to store the document - # in the system. This will be a transaction and reasonably fast. - try: - with transaction.atomic(): - # store the document. - if self.input_doc.root_document_id: - # If this is a new version of an existing document, we need - # to make sure we're not creating a new document, but updating - # the existing one. - root_doc = Document.objects.get( - pk=self.input_doc.root_document_id, + text = document_parser.get_text() + date = document_parser.get_date() + if date is None: + self._send_progress( + 90, + 100, + ProgressStatusOptions.WORKING, + ConsumerStatusShortMessage.PARSE_DATE, ) - original_document = self._create_version_from_root( - root_doc, - text=text, - page_count=page_count, - mime_type=mime_type, - ) - actor = None + with get_date_parser() as date_parser: + date = next(date_parser.parse(self.filename, text), None) + archive_path = document_parser.get_archive_path() + page_count = document_parser.get_page_count( + self.working_copy, + mime_type, + ) - # Save the new version, potentially creating an audit log entry for the version addition if enabled. - if ( - settings.AUDIT_LOG_ENABLED - and self.metadata.actor_id is not None - ): - actor = User.objects.filter(pk=self.metadata.actor_id).first() - if actor is not None: - from auditlog.context import ( # type: ignore[import-untyped] - set_actor, - ) + except ParseError as e: + if tempdir: + tempdir.cleanup() + self._fail( + str(e), + f"Error occurred while consuming document {self.filename}: {e}", + exc_info=True, + exception=e, + ) + except Exception as e: + if tempdir: + tempdir.cleanup() + self._fail( + str(e), + f"Unexpected error while consuming document {self.filename}: {e}", + exc_info=True, + exception=e, + ) - with set_actor(actor): + # Prepare the document classifier. + + # TODO: I don't really like to do this here, but this way we avoid + # reloading the classifier multiple times, since there are multiple + # post-consume hooks that all require the classifier. + + classifier = load_classifier() + + self._send_progress( + 95, + 100, + ProgressStatusOptions.WORKING, + ConsumerStatusShortMessage.SAVE_DOCUMENT, + ) + # now that everything is done, we can start to store the document + # in the system. This will be a transaction and reasonably fast. + try: + with transaction.atomic(): + # store the document. + if self.input_doc.root_document_id: + # If this is a new version of an existing document, we need + # to make sure we're not creating a new document, but updating + # the existing one. + root_doc = Document.objects.get( + pk=self.input_doc.root_document_id, + ) + original_document = self._create_version_from_root( + root_doc, + text=text, + page_count=page_count, + mime_type=mime_type, + ) + actor = None + + # Save the new version, potentially creating an audit log entry for the version addition if enabled. + if ( + settings.AUDIT_LOG_ENABLED + and self.metadata.actor_id is not None + ): + actor = User.objects.filter( + pk=self.metadata.actor_id, + ).first() + if actor is not None: + from auditlog.context import ( # type: ignore[import-untyped] + set_actor, + ) + + with set_actor(actor): + original_document.save() + else: original_document.save() else: original_document.save() + + # Create a log entry for the version addition, if enabled + if settings.AUDIT_LOG_ENABLED: + from auditlog.models import ( # type: ignore[import-untyped] + LogEntry, + ) + + LogEntry.objects.log_create( + instance=root_doc, + changes={ + "Version Added": ["None", original_document.id], + }, + action=LogEntry.Action.UPDATE, + actor=actor, + additional_data={ + "reason": "Version added", + "version_id": original_document.id, + }, + ) + document = original_document else: - original_document.save() - - # Create a log entry for the version addition, if enabled - if settings.AUDIT_LOG_ENABLED: - from auditlog.models import ( # type: ignore[import-untyped] - LogEntry, + document = self._store( + text=text, + date=date, + page_count=page_count, + mime_type=mime_type, ) - LogEntry.objects.log_create( - instance=root_doc, - changes={ - "Version Added": ["None", original_document.id], - }, - action=LogEntry.Action.UPDATE, - actor=actor, - additional_data={ - "reason": "Version added", - "version_id": original_document.id, - }, - ) - document = original_document - else: - document = self._store( - text=text, - date=date, - page_count=page_count, - mime_type=mime_type, - ) + # If we get here, it was successful. Proceed with post-consume + # hooks. If they fail, nothing will get changed. - # If we get here, it was successful. Proceed with post-consume - # hooks. If they fail, nothing will get changed. - - document_consumption_finished.send( - sender=self.__class__, - document=document, - logging_group=self.logging_group, - classifier=classifier, - original_file=self.unmodified_original - if self.unmodified_original - else self.working_copy, - ) - - # After everything is in the database, copy the files into - # place. If this fails, we'll also rollback the transaction. - with FileLock(settings.MEDIA_LOCK): - generated_filename = generate_unique_filename(document) - if ( - len(str(generated_filename)) - > Document.MAX_STORED_FILENAME_LENGTH - ): - self.log.warning( - "Generated source filename exceeds db path limit, falling back to default naming", - ) - generated_filename = generate_filename( - document, - use_format=False, - ) - document.filename = generated_filename - create_source_path_directory(document.source_path) - - self._write( - self.unmodified_original - if self.unmodified_original is not None + document_consumption_finished.send( + sender=self.__class__, + document=document, + logging_group=self.logging_group, + classifier=classifier, + original_file=self.unmodified_original + if self.unmodified_original else self.working_copy, - document.source_path, ) - self._write( - thumbnail, - document.thumbnail_path, - ) - - if archive_path and Path(archive_path).is_file(): - generated_archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) + # After everything is in the database, copy the files into + # place. If this fails, we'll also rollback the transaction. + with FileLock(settings.MEDIA_LOCK): + generated_filename = generate_unique_filename(document) if ( - len(str(generated_archive_filename)) + len(str(generated_filename)) > Document.MAX_STORED_FILENAME_LENGTH ): self.log.warning( - "Generated archive filename exceeds db path limit, falling back to default naming", + "Generated source filename exceeds db path limit, falling back to default naming", ) - generated_archive_filename = generate_filename( + generated_filename = generate_filename( document, - archive_filename=True, use_format=False, ) - document.archive_filename = generated_archive_filename - create_source_path_directory(document.archive_path) + document.filename = generated_filename + create_source_path_directory(document.source_path) + self._write( - archive_path, - document.archive_path, + self.unmodified_original + if self.unmodified_original is not None + else self.working_copy, + document.source_path, ) - with Path(archive_path).open("rb") as f: - document.archive_checksum = hashlib.md5( - f.read(), - ).hexdigest() + self._write( + thumbnail, + document.thumbnail_path, + ) - # Don't save with the lock active. Saving will cause the file - # renaming logic to acquire the lock as well. - # This triggers things like file renaming - document.save() + if archive_path and Path(archive_path).is_file(): + generated_archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + if ( + len(str(generated_archive_filename)) + > Document.MAX_STORED_FILENAME_LENGTH + ): + self.log.warning( + "Generated archive filename exceeds db path limit, falling back to default naming", + ) + generated_archive_filename = generate_filename( + document, + archive_filename=True, + use_format=False, + ) + document.archive_filename = generated_archive_filename + create_source_path_directory(document.archive_path) + self._write( + archive_path, + document.archive_path, + ) - if document.root_document_id: - document_updated.send( - sender=self.__class__, - document=document.root_document, - ) + with Path(archive_path).open("rb") as f: + document.archive_checksum = hashlib.md5( + f.read(), + ).hexdigest() - # Delete the file only if it was successfully consumed - self.log.debug(f"Deleting original file {self.input_doc.original_file}") - self.input_doc.original_file.unlink() - self.log.debug(f"Deleting working copy {self.working_copy}") - self.working_copy.unlink() - if self.unmodified_original is not None: # pragma: no cover + # Don't save with the lock active. Saving will cause the file + # renaming logic to acquire the lock as well. + # This triggers things like file renaming + document.save() + + if document.root_document_id: + document_updated.send( + sender=self.__class__, + document=document.root_document, + ) + + # Delete the file only if it was successfully consumed self.log.debug( - f"Deleting unmodified original file {self.unmodified_original}", + f"Deleting original file {self.input_doc.original_file}", ) - self.unmodified_original.unlink() + self.input_doc.original_file.unlink() + self.log.debug(f"Deleting working copy {self.working_copy}") + self.working_copy.unlink() + if self.unmodified_original is not None: # pragma: no cover + self.log.debug( + f"Deleting unmodified original file {self.unmodified_original}", + ) + self.unmodified_original.unlink() - # https://github.com/jonaswinkler/paperless-ng/discussions/1037 - shadow_file = ( - Path(self.input_doc.original_file).parent - / f"._{Path(self.input_doc.original_file).name}" + # https://github.com/jonaswinkler/paperless-ng/discussions/1037 + shadow_file = ( + Path(self.input_doc.original_file).parent + / f"._{Path(self.input_doc.original_file).name}" + ) + + if Path(shadow_file).is_file(): + self.log.debug(f"Deleting shadow file {shadow_file}") + Path(shadow_file).unlink() + + except Exception as e: + self._fail( + str(e), + f"The following error occurred while storing document " + f"{self.filename} after parsing: {e}", + exc_info=True, + exception=e, ) - - if Path(shadow_file).is_file(): - self.log.debug(f"Deleting shadow file {shadow_file}") - Path(shadow_file).unlink() - - except Exception as e: - self._fail( - str(e), - f"The following error occurred while storing document " - f"{self.filename} after parsing: {e}", - exc_info=True, - exception=e, - ) - finally: - _parser_cleanup(document_parser) - tempdir.cleanup() + finally: + tempdir.cleanup() self.run_post_consume_script(document) diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py index 1756f8754..2812e0dc0 100644 --- a/src/documents/management/commands/document_thumbnails.py +++ b/src/documents/management/commands/document_thumbnails.py @@ -4,11 +4,6 @@ import shutil from documents.management.commands.base import PaperlessCommand from documents.models import Document from documents.parsers import get_parser_class_for_mime_type -from paperless.parsers.mail import MailDocumentParser -from paperless.parsers.remote import RemoteDocumentParser -from paperless.parsers.tesseract import RasterisedDocumentParser -from paperless.parsers.text import TextDocumentParser -from paperless.parsers.tika import TikaDocumentParser logger = logging.getLogger("paperless.management.thumbnails") @@ -25,40 +20,9 @@ def _process_document(doc_id: int) -> None: ) return - parser = parser_class(logging_group=None) - - parser_is_new_style = isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ) - - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - parser.__enter__() - - try: - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - thumb = parser.get_thumbnail(document.source_path, document.mime_type) - else: - thumb = parser.get_thumbnail( - document.source_path, - document.mime_type, - document.get_public_filename(), - ) + with parser_class() as parser: + thumb = parser.get_thumbnail(document.source_path, document.mime_type) shutil.move(thumb, document.thumbnail_path) - finally: - # TODO(stumpylog): Cleanup once all parsers are handled - if parser_is_new_style: - parser.__exit__(None, None, None) - else: - parser.cleanup() class Command(PaperlessCommand): diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 372cf0491..94233c648 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -7,20 +7,21 @@ import re import shutil import subprocess import tempfile -from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING from django.conf import settings from documents.loggers import LoggingMixin -from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.parsers.registry import get_parser_registry if TYPE_CHECKING: import datetime + from paperless.parsers import ParserProtocol + # This regular expression will try to find dates in the document at # hand and will match the following formats: # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits @@ -52,7 +53,6 @@ DATE_REGEX = re.compile( logger = logging.getLogger("paperless.parsing") -@lru_cache(maxsize=8) def is_mime_type_supported(mime_type: str) -> bool: """ Returns True if the mime type is supported, False otherwise @@ -60,27 +60,21 @@ def is_mime_type_supported(mime_type: str) -> bool: return get_parser_class_for_mime_type(mime_type) is not None -@lru_cache(maxsize=8) def get_default_file_extension(mime_type: str) -> str: """ Returns the default file extension for a mimetype, or an empty string if it could not be determined """ - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - if mime_type in supported_mime_types: - return supported_mime_types[mime_type] + parser_class = get_parser_class_for_mime_type(mime_type) + if parser_class is not None: + supported = parser_class.supported_mime_types() + if mime_type in supported: + return supported[mime_type] ext = mimetypes.guess_extension(mime_type) - if ext: - return ext - else: - return "" + return ext if ext else "" -@lru_cache(maxsize=8) def is_file_ext_supported(ext: str) -> bool: """ Returns True if the file extension is supported, False otherwise @@ -94,42 +88,22 @@ def is_file_ext_supported(ext: str) -> bool: def get_supported_file_extensions() -> set[str]: extensions = set() - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - for mime_type in supported_mime_types: + for parser_class in get_parser_registry().all_parsers(): + for mime_type, ext in parser_class.supported_mime_types().items(): extensions.update(mimetypes.guess_all_extensions(mime_type)) # Python's stdlib might be behind, so also add what the parser # says is the default extension # This makes image/webp supported on Python < 3.11 - extensions.add(supported_mime_types[mime_type]) + extensions.add(ext) return extensions -def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None: +def get_parser_class_for_mime_type(mime_type: str) -> type[ParserProtocol] | None: """ - Returns the best parser (by weight) for the given mimetype or - None if no parser exists + Returns the best parser for the given mimetype or None if no parser exists. """ - - options = [] - - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - if mime_type in supported_mime_types: - options.append(parser_declaration) - - if not options: - return None - - best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0] - - # Return the parser with the highest weight. - return best_parser["parser"] + return get_parser_registry().get_parser_for_file(mime_type, "") def run_convert( diff --git a/src/documents/tasks.py b/src/documents/tasks.py index a8ca0cc5f..e8186a4f1 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -52,7 +52,6 @@ from documents.models import StoragePath from documents.models import Tag from documents.models import WorkflowRun from documents.models import WorkflowTrigger -from documents.parsers import DocumentParser from documents.parsers import get_parser_class_for_mime_type from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import ProgressManager @@ -66,11 +65,6 @@ from documents.signals.handlers import send_websocket_document_updated from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig from paperless.parsers import ParserContext -from paperless.parsers.mail import MailDocumentParser -from paperless.parsers.remote import RemoteDocumentParser -from paperless.parsers.tesseract import RasterisedDocumentParser -from paperless.parsers.text import TextDocumentParser -from paperless.parsers.tika import TikaDocumentParser from paperless_ai.indexing import llm_index_add_or_update_document from paperless_ai.indexing import llm_index_remove_document from paperless_ai.indexing import update_llm_index @@ -310,9 +304,7 @@ def update_document_content_maybe_archive_file(document_id) -> None: mime_type = document.mime_type - parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type( - mime_type, - ) + parser_class = get_parser_class_for_mime_type(mime_type) if not parser_class: logger.error( @@ -321,138 +313,92 @@ def update_document_content_maybe_archive_file(document_id) -> None: ) return - parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) + with parser_class() as parser: + parser.configure(ParserContext()) - parser_is_new_style = isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ) - - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - parser.__enter__() - - try: - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - parser.configure(ParserContext()) + try: parser.parse(document.source_path, mime_type) - else: - parser.parse( - document.source_path, - mime_type, - document.get_public_filename(), - ) - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: thumbnail = parser.get_thumbnail(document.source_path, mime_type) - else: - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) - with transaction.atomic(): - oldDocument = Document.objects.get(pk=document.pk) - if parser.get_archive_path(): - with Path(parser.get_archive_path()).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() - # I'm going to save first so that in case the file move - # fails, the database is rolled back. - # We also don't use save() since that triggers the filehandling - # logic, and we don't want that yet (file not yet in place) - document.archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) - Document.objects.filter(pk=document.pk).update( - archive_checksum=checksum, - content=parser.get_text(), - archive_filename=document.archive_filename, - ) - newDocument = Document.objects.get(pk=document.pk) - if settings.AUDIT_LOG_ENABLED: - LogEntry.objects.log_create( - instance=oldDocument, - changes={ - "content": [oldDocument.content, newDocument.content], - "archive_checksum": [ - oldDocument.archive_checksum, - newDocument.archive_checksum, - ], - "archive_filename": [ - oldDocument.archive_filename, - newDocument.archive_filename, - ], - }, - additional_data={ - "reason": "Update document content", - }, - action=LogEntry.Action.UPDATE, - ) - else: - Document.objects.filter(pk=document.pk).update( - content=parser.get_text(), - ) - - if settings.AUDIT_LOG_ENABLED: - LogEntry.objects.log_create( - instance=oldDocument, - changes={ - "content": [oldDocument.content, parser.get_text()], - }, - additional_data={ - "reason": "Update document content", - }, - action=LogEntry.Action.UPDATE, - ) - - with FileLock(settings.MEDIA_LOCK): + with transaction.atomic(): + oldDocument = Document.objects.get(pk=document.pk) if parser.get_archive_path(): - create_source_path_directory(document.archive_path) - shutil.move(parser.get_archive_path(), document.archive_path) - shutil.move(thumbnail, document.thumbnail_path) + with Path(parser.get_archive_path()).open("rb") as f: + checksum = hashlib.md5(f.read()).hexdigest() + # I'm going to save first so that in case the file move + # fails, the database is rolled back. + # We also don't use save() since that triggers the filehandling + # logic, and we don't want that yet (file not yet in place) + document.archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + Document.objects.filter(pk=document.pk).update( + archive_checksum=checksum, + content=parser.get_text(), + archive_filename=document.archive_filename, + ) + newDocument = Document.objects.get(pk=document.pk) + if settings.AUDIT_LOG_ENABLED: + LogEntry.objects.log_create( + instance=oldDocument, + changes={ + "content": [oldDocument.content, newDocument.content], + "archive_checksum": [ + oldDocument.archive_checksum, + newDocument.archive_checksum, + ], + "archive_filename": [ + oldDocument.archive_filename, + newDocument.archive_filename, + ], + }, + additional_data={ + "reason": "Update document content", + }, + action=LogEntry.Action.UPDATE, + ) + else: + Document.objects.filter(pk=document.pk).update( + content=parser.get_text(), + ) - document.refresh_from_db() - logger.info( - f"Updating index for document {document_id} ({document.archive_checksum})", - ) - with index.open_index_writer() as writer: - index.update_document(writer, document) + if settings.AUDIT_LOG_ENABLED: + LogEntry.objects.log_create( + instance=oldDocument, + changes={ + "content": [oldDocument.content, parser.get_text()], + }, + additional_data={ + "reason": "Update document content", + }, + action=LogEntry.Action.UPDATE, + ) - ai_config = AIConfig() - if ai_config.llm_index_enabled: - llm_index_add_or_update_document(document) + with FileLock(settings.MEDIA_LOCK): + if parser.get_archive_path(): + create_source_path_directory(document.archive_path) + shutil.move(parser.get_archive_path(), document.archive_path) + shutil.move(thumbnail, document.thumbnail_path) - clear_document_caches(document.pk) + document.refresh_from_db() + logger.info( + f"Updating index for document {document_id} ({document.archive_checksum})", + ) + with index.open_index_writer() as writer: + index.update_document(writer, document) - except Exception: - logger.exception( - f"Error while parsing document {document} (ID: {document_id})", - ) - finally: - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ): - parser.__exit__(None, None, None) - else: - parser.cleanup() + ai_config = AIConfig() + if ai_config.llm_index_enabled: + llm_index_add_or_update_document(document) + + clear_document_caches(document.pk) + + except Exception: + logger.exception( + f"Error while parsing document {document} (ID: {document_id})", + ) @shared_task diff --git a/src/documents/tests/test_checks.py b/src/documents/tests/test_checks.py index b78946ba9..51d9cdddc 100644 --- a/src/documents/tests/test_checks.py +++ b/src/documents/tests/test_checks.py @@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase): def test_parser_check(self) -> None: self.assertEqual(parser_check(None), []) - with mock.patch("documents.checks.document_consumer_declaration.send") as m: - m.return_value = [] + with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn: + mock_registry = mock.MagicMock() + mock_registry.all_parsers.return_value = [] + mock_registry_fn.return_value = mock_registry self.assertEqual( parser_check(None), diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a3574fdce..7ea438187 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -27,7 +27,6 @@ from documents.models import Document from documents.models import DocumentType from documents.models import StoragePath from documents.models import Tag -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.plugins.helpers import ProgressStatusOptions from documents.tasks import sanity_check @@ -38,62 +37,104 @@ from documents.tests.utils import GetConsumerMixin from paperless_mail.models import MailRule -class _BaseTestParser(DocumentParser): - def get_settings(self) -> None: - """ - This parser does not implement additional settings yet - """ +class _BaseNewStyleParser: + """Minimal ParserProtocol implementation for use in consumer tests.""" + + name: str = "test-parser" + version: str = "0.1" + author: str = "test" + url: str = "test" + + @classmethod + def supported_mime_types(cls) -> dict: + return { + "application/pdf": ".pdf", + "image/png": ".png", + "message/rfc822": ".eml", + } + + @classmethod + def score(cls, mime_type: str, filename: str, path=None): + return 0 if mime_type in cls.supported_mime_types() else None + + @property + def can_produce_archive(self) -> bool: + return True + + @property + def requires_pdf_rendition(self) -> bool: + return False + + def __init__(self) -> None: + self._tmpdir: Path | None = None + self._text: str | None = None + self._archive: Path | None = None + self._thumb: Path | None = None + + def __enter__(self): + self._tmpdir = Path( + tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR), + ) + _, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir) + self._thumb = Path(thumb) + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + if self._tmpdir and self._tmpdir.exists(): + shutil.rmtree(self._tmpdir, ignore_errors=True) + + def configure(self, context) -> None: + pass + + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: + raise NotImplementedError + + def get_text(self) -> str | None: + return self._text + + def get_date(self): return None + def get_archive_path(self): + return self._archive -class DummyParser(_BaseTestParser): - def __init__(self, logging_group, scratch_dir, archive_path) -> None: - super().__init__(logging_group, None) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) - self.archive_path = archive_path + def get_thumbnail(self, document_path, mime_type) -> Path: + return self._thumb - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb + def get_page_count(self, document_path, mime_type): + return None - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = "The Text" + def extract_metadata(self, document_path, mime_type) -> list: + return [] -class CopyParser(_BaseTestParser): - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb +class DummyParser(_BaseNewStyleParser): + _ARCHIVE_SRC = ( + Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf" + ) - def __init__(self, logging_group, progress_callback=None) -> None: - super().__init__(logging_group, progress_callback) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir) - - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = "The text" - self.archive_path = Path(self.tempdir / "archive.pdf") - shutil.copy(document_path, self.archive_path) + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: + self._text = "The Text" + if produce_archive and self._tmpdir: + self._archive = self._tmpdir / "archive.pdf" + shutil.copy(self._ARCHIVE_SRC, self._archive) -class FaultyParser(_BaseTestParser): - def __init__(self, logging_group, scratch_dir) -> None: - super().__init__(logging_group) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) +class CopyParser(_BaseNewStyleParser): + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: + self._text = "The text" + if produce_archive and self._tmpdir: + self._archive = self._tmpdir / "archive.pdf" + shutil.copy(document_path, self._archive) - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb - def parse(self, document_path, mime_type, file_name=None): +class FaultyParser(_BaseNewStyleParser): + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: raise ParseError("Does not compute.") -class FaultyGenericExceptionParser(_BaseTestParser): - def __init__(self, logging_group, scratch_dir) -> None: - super().__init__(logging_group) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) - - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb - - def parse(self, document_path, mime_type, file_name=None): +class FaultyGenericExceptionParser(_BaseNewStyleParser): + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: raise Exception("Generic exception.") @@ -147,38 +188,12 @@ class TestConsumer( self.assertEqual(payload["data"]["max_progress"], last_progress_max) self.assertEqual(payload["data"]["status"], last_status) - def make_dummy_parser(self, logging_group, progress_callback=None): - return DummyParser( - logging_group, - self.dirs.scratch_dir, - self.get_test_archive_file(), - ) - - def make_faulty_parser(self, logging_group, progress_callback=None): - return FaultyParser(logging_group, self.dirs.scratch_dir) - - def make_faulty_generic_exception_parser( - self, - logging_group, - progress_callback=None, - ): - return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir) - def setUp(self) -> None: super().setUp() - patcher = mock.patch("documents.parsers.document_consumer_declaration.send") + patcher = mock.patch("documents.consumer.get_parser_class_for_mime_type") m = patcher.start() - m.return_value = [ - ( - None, - { - "parser": self.make_dummy_parser, - "mime_types": {"application/pdf": ".pdf"}, - "weight": 0, - }, - ), - ] + m.return_value = DummyParser self.addCleanup(patcher.stop) def get_test_file(self): @@ -547,9 +562,9 @@ class TestConsumer( ) as consumer: consumer.run() - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_class_for_mime_type") def testNoParsers(self, m) -> None: - m.return_value = [] + m.return_value = None with self.assertRaisesMessage( ConsumerError, @@ -560,18 +575,9 @@ class TestConsumer( self._assert_first_last_send_progress(last_status="FAILED") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_class_for_mime_type") def testFaultyParser(self, m) -> None: - m.return_value = [ - ( - None, - { - "parser": self.make_faulty_parser, - "mime_types": {"application/pdf": ".pdf"}, - "weight": 0, - }, - ), - ] + m.return_value = FaultyParser with self.get_consumer(self.get_test_file()) as consumer: with self.assertRaisesMessage( @@ -582,18 +588,9 @@ class TestConsumer( self._assert_first_last_send_progress(last_status="FAILED") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_class_for_mime_type") def testGenericParserException(self, m) -> None: - m.return_value = [ - ( - None, - { - "parser": self.make_faulty_generic_exception_parser, - "mime_types": {"application/pdf": ".pdf"}, - "weight": 0, - }, - ), - ] + m.return_value = FaultyGenericExceptionParser with self.get_consumer(self.get_test_file()) as consumer: with self.assertRaisesMessage( @@ -1017,7 +1014,7 @@ class TestConsumer( self._assert_first_last_send_progress() @override_settings(FILENAME_FORMAT="{title}") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_class_for_mime_type") def test_similar_filenames(self, m) -> None: shutil.copy( Path(__file__).parent / "samples" / "simple.pdf", @@ -1031,16 +1028,7 @@ class TestConsumer( Path(__file__).parent / "samples" / "simple-noalpha.png", settings.CONSUMPTION_DIR / "simple.png.pdf", ) - m.return_value = [ - ( - None, - { - "parser": CopyParser, - "mime_types": {"application/pdf": ".pdf", "image/png": ".png"}, - "weight": 0, - }, - ), - ] + m.return_value = CopyParser with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer: consumer.run() @@ -1068,8 +1056,9 @@ class TestConsumer( sanity_check() + @mock.patch("documents.consumer.get_parser_class_for_mime_type", return_value=None) @mock.patch("documents.consumer.run_subprocess") - def test_try_to_clean_invalid_pdf(self, m) -> None: + def test_try_to_clean_invalid_pdf(self, m, _) -> None: shutil.copy( Path(__file__).parent / "samples" / "invalid_pdf.pdf", settings.CONSUMPTION_DIR / "invalid_pdf.pdf", @@ -1091,10 +1080,10 @@ class TestConsumer( @mock.patch("paperless_mail.models.MailRule.objects.get") @mock.patch("paperless.parsers.mail.MailDocumentParser.parse") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_class_for_mime_type") def test_mail_parser_receives_mailrule( self, - mock_consumer_declaration_send: mock.Mock, + mock_get_parser_class: mock.Mock, mock_mail_parser_parse: mock.Mock, mock_mailrule_get: mock.Mock, ) -> None: @@ -1106,18 +1095,9 @@ class TestConsumer( THEN: - The mail parser should receive the mail rule """ - from paperless_mail.signals import get_parser as mail_get_parser + from paperless.parsers.mail import MailDocumentParser - mock_consumer_declaration_send.return_value = [ - ( - None, - { - "parser": mail_get_parser, - "mime_types": {"message/rfc822": ".eml"}, - "weight": 0, - }, - ), - ] + mock_get_parser_class.return_value = MailDocumentParser mock_mailrule_get.return_value = mock.Mock( pdf_layout=MailRule.PdfLayout.HTML_ONLY, ) diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 5ea1b361e..fbd88adf0 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -1,7 +1,5 @@ -from tempfile import TemporaryDirectory from unittest import mock -from django.apps import apps from django.test import TestCase from django.test import override_settings @@ -9,14 +7,26 @@ from documents.parsers import get_default_file_extension from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported +from paperless.parsers.registry import reset_parser_registry from paperless.parsers.tesseract import RasterisedDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser class TestParserDiscovery(TestCase): - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_1_parser(self, m, *args) -> None: + def _mock_registry(self, return_value): + """Return a context manager that patches get_parser_registry.""" + mock_registry = mock.MagicMock() + mock_registry.get_parser_for_file.return_value = return_value + mock_registry.all_parsers.return_value = ( + [return_value] if return_value is not None else [] + ) + return mock.patch( + "documents.parsers.get_parser_registry", + return_value=mock_registry, + ) + + def test_get_parser_class_1_parser(self) -> None: """ GIVEN: - Parser declared for a given mimetype @@ -29,63 +39,33 @@ class TestParserDiscovery(TestCase): class DummyParser: pass - m.return_value = ( - ( - None, - { - "weight": 0, - "parser": DummyParser, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ) + with self._mock_registry(DummyParser): + self.assertEqual( + get_parser_class_for_mime_type("application/pdf"), + DummyParser, + ) - self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser) - - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_n_parsers(self, m, *args) -> None: + def test_get_parser_class_n_parsers(self) -> None: """ GIVEN: - - Two parsers declared for a given mimetype - - Second parser has a higher weight + - Two parsers declared for a given mimetype, second has higher score WHEN: - Attempt to get parser for the mimetype THEN: - - Second parser class is returned + - Second (higher-score) parser class is returned """ - class DummyParser1: - pass - class DummyParser2: pass - m.return_value = ( - ( - None, - { - "weight": 0, - "parser": DummyParser1, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ( - None, - { - "weight": 1, - "parser": DummyParser2, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ) + # The registry already handles scoring; get_parser_for_file returns the winner. + with self._mock_registry(DummyParser2): + self.assertEqual( + get_parser_class_for_mime_type("application/pdf"), + DummyParser2, + ) - self.assertEqual( - get_parser_class_for_mime_type("application/pdf"), - DummyParser2, - ) - - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_0_parsers(self, m, *args) -> None: + def test_get_parser_class_0_parsers(self) -> None: """ GIVEN: - No parsers are declared @@ -94,37 +74,20 @@ class TestParserDiscovery(TestCase): THEN: - No parser class is returned """ - m.return_value = [] - with TemporaryDirectory(): + with self._mock_registry(None): self.assertIsNone(get_parser_class_for_mime_type("application/pdf")) - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_no_valid_parser(self, m, *args) -> None: + def test_get_parser_class_no_valid_parser(self) -> None: """ GIVEN: - No parser declared for a given mimetype - - Parser declared for a different mimetype WHEN: - Attempt to get parser for the given mimetype THEN: - No parser class is returned """ - - class DummyParser: - pass - - m.return_value = ( - ( - None, - { - "weight": 0, - "parser": DummyParser, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ) - - self.assertIsNone(get_parser_class_for_mime_type("image/tiff")) + with self._mock_registry(None): + self.assertIsNone(get_parser_class_for_mime_type("image/tiff")) class TestParserAvailability(TestCase): @@ -151,7 +114,7 @@ class TestParserAvailability(TestCase): self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( - get_parser_class_for_mime_type(mime_type)(logging_group=None), + get_parser_class_for_mime_type(mime_type)(), RasterisedDocumentParser, ) @@ -175,7 +138,7 @@ class TestParserAvailability(TestCase): self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( - get_parser_class_for_mime_type(mime_type)(logging_group=None), + get_parser_class_for_mime_type(mime_type)(), TextDocumentParser, ) @@ -198,19 +161,20 @@ class TestParserAvailability(TestCase): ), ] - # Force the app ready to notice the settings override - with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]): - app = apps.get_app_config("paperless_tika") - app.ready() + self.addCleanup(reset_parser_registry) + + # Reset and rebuild the registry with Tika enabled. + with override_settings(TIKA_ENABLED=True): + reset_parser_registry() supported_exts = get_supported_file_extensions() - for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) - self.assertEqual(get_default_file_extension(mime_type), ext) - self.assertIsInstance( - get_parser_class_for_mime_type(mime_type)(logging_group=None), - TikaDocumentParser, - ) + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(), + TikaDocumentParser, + ) def test_no_parser_for_mime(self) -> None: self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) diff --git a/src/documents/views.py b/src/documents/views.py index 91fb0c1ff..91568673b 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -7,7 +7,6 @@ import tempfile import zipfile from collections import defaultdict from collections import deque -from contextlib import nullcontext from datetime import datetime from pathlib import Path from time import mktime @@ -226,7 +225,6 @@ from paperless.celery import app as celery_app from paperless.config import AIConfig from paperless.config import GeneralConfig from paperless.models import ApplicationConfiguration -from paperless.parsers import ParserProtocol from paperless.serialisers import GroupSerializer from paperless.serialisers import UserSerializer from paperless.views import StandardPagination @@ -1085,15 +1083,11 @@ class DocumentViewSet( parser_class = get_parser_class_for_mime_type(mime_type) if parser_class: - parser = parser_class(progress_callback=None, logging_group=None) - cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser) - try: - with cm: + with parser_class() as parser: return parser.extract_metadata(file, mime_type) except Exception: # pragma: no cover logger.exception(f"Issue getting metadata for {file}") - # TODO: cover GPG errors, remove later. return [] else: # pragma: no cover logger.warning(f"No parser for {mime_type}") diff --git a/src/paperless/parsers/registry.py b/src/paperless/parsers/registry.py index 7effe554f..3c4740216 100644 --- a/src/paperless/parsers/registry.py +++ b/src/paperless/parsers/registry.py @@ -304,6 +304,23 @@ class ParserRegistry: getattr(cls, "url", "unknown"), ) + # ------------------------------------------------------------------ + # Inspection helpers + # ------------------------------------------------------------------ + + def all_parsers(self) -> list[type[ParserProtocol]]: + """Return all registered parser classes (external first, then builtins). + + Used by compatibility wrappers that need to iterate every parser to + compute the full set of supported MIME types and file extensions. + + Returns + ------- + list[type[ParserProtocol]] + External parsers followed by built-in parsers. + """ + return [*self._external, *self._builtins] + # ------------------------------------------------------------------ # Parser resolution # ------------------------------------------------------------------