diff --git a/.mypy-baseline.txt b/.mypy-baseline.txt index 2700bfc71..61ffe9c10 100644 --- a/.mypy-baseline.txt +++ b/.mypy-baseline.txt @@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr] src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr] src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr] -src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def] -src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def] -src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None") [assignment] -src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def] -src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def] -src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path" [arg-type] -src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation [no-untyped-def] -src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def] -src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def] -src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def] -src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def] 
-src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None") [assignment] -src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def] -src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def] diff --git a/pyproject.toml b/pyproject.toml index f2a20ac47..ee89ae4dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -269,10 +269,6 @@ testpaths = [ "src/documents/tests/", "src/paperless/tests/", "src/paperless_mail/tests/", - "src/paperless_tesseract/tests/", - "src/paperless_tika/tests", - "src/paperless_text/tests/", - "src/paperless_remote/tests/", "src/paperless_ai/tests", ] diff --git a/src/documents/checks.py b/src/documents/checks.py index b6e9e90fc..0867ef403 100644 --- a/src/documents/checks.py +++ b/src/documents/checks.py @@ -3,25 +3,20 @@ from django.core.checks import Error from django.core.checks import Warning from django.core.checks import register -from documents.signals import document_consumer_declaration from documents.templating.utils import convert_format_str_to_template_format +from paperless.parsers.registry import get_parser_registry @register() def parser_check(app_configs, **kwargs): - parsers = [] - for response in document_consumer_declaration.send(None): - parsers.append(response[1]) - - if len(parsers) == 0: + if not get_parser_registry().all_parsers(): return [ Error( "No parsers found. This is a bug. 
The consumer won't be " "able to consume any documents without parsers.", ), ] - else: - return [] + return [] @register() diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 81d9eb456..809d6c647 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -32,9 +32,7 @@ from documents.models import DocumentType from documents.models import StoragePath from documents.models import Tag from documents.models import WorkflowTrigger -from documents.parsers import DocumentParser from documents.parsers import ParseError -from documents.parsers import get_parser_class_for_mime_type from documents.permissions import set_permissions_for_object from documents.plugins.base import AlwaysRunPluginMixin from documents.plugins.base import ConsumeTaskPlugin @@ -52,40 +50,12 @@ from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess from paperless.parsers import ParserContext -from paperless.parsers.mail import MailDocumentParser -from paperless.parsers.remote import RemoteDocumentParser -from paperless.parsers.tesseract import RasterisedDocumentParser -from paperless.parsers.text import TextDocumentParser -from paperless.parsers.tika import TikaDocumentParser +from paperless.parsers import ParserProtocol +from paperless.parsers.registry import get_parser_registry LOGGING_NAME: Final[str] = "paperless.consumer" -def _parser_cleanup(parser: DocumentParser) -> None: - """ - Call cleanup on a parser, handling the new-style context-manager parsers. - - New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown - instead of a cleanup() method. 
This shim will be removed once all existing parsers - have switched to the new style and this consumer is updated to use it - - TODO(stumpylog): Remove me in the future - """ - if isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ): - parser.__exit__(None, None, None) - else: - parser.cleanup() - - class WorkflowTriggerPlugin( NoCleanupPluginMixin, NoSetupPluginMixin, @@ -422,8 +392,12 @@ class ConsumerPlugin( self.log.error(f"Error attempting to clean PDF: {e}") # Based on the mime type, get the parser for that type - parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type( - mime_type, + parser_class: type[ParserProtocol] | None = ( + get_parser_registry().get_parser_for_file( + mime_type, + self.filename, + self.working_copy, + ) ) if not parser_class: tempdir.cleanup() @@ -446,313 +420,275 @@ class ConsumerPlugin( tempdir.cleanup() raise - def progress_callback( - current_progress, - max_progress, - ) -> None: # pragma: no cover - # recalculate progress to be within 20 and 80 - p = int((current_progress / max_progress) * 50 + 20) - self._send_progress(p, 100, ProgressStatusOptions.WORKING) - # This doesn't parse the document yet, but gives us a parser. - - document_parser: DocumentParser = parser_class( - self.logging_group, - progress_callback=progress_callback, - ) - - parser_is_new_style = isinstance( - document_parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ) - - # New-style parsers use __enter__/__exit__ for resource management. - # _parser_cleanup (below) handles __exit__; call __enter__ here. - # TODO(stumpylog): Remove me in the future - if parser_is_new_style: - document_parser.__enter__() - - self.log.debug(f"Parser: {type(document_parser).__name__}") - - # Parse the document. This may take some time. 
- - text = None - date = None - thumbnail = None - archive_path = None - page_count = None - - try: - self._send_progress( - 20, - 100, - ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.PARSING_DOCUMENT, + with parser_class() as document_parser: + document_parser.configure( + ParserContext(mailrule_id=self.input_doc.mailrule_id), ) - self.log.debug(f"Parsing {self.filename}...") - # TODO(stumpylog): Remove me in the future when all parsers use new protocol - if parser_is_new_style: - document_parser.configure( - ParserContext(mailrule_id=self.input_doc.mailrule_id), - ) - # TODO(stumpylog): Remove me in the future - document_parser.parse(self.working_copy, mime_type) - else: - document_parser.parse(self.working_copy, mime_type, self.filename) + self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}") - self.log.debug(f"Generating thumbnail for {self.filename}...") - self._send_progress( - 70, - 100, - ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.GENERATING_THUMBNAIL, - ) - # TODO(stumpylog): Remove me in the future when all parsers use new protocol - if parser_is_new_style: - thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) - else: - thumbnail = document_parser.get_thumbnail( - self.working_copy, - mime_type, - self.filename, - ) + # Parse the document. This may take some time. 
- text = document_parser.get_text() - date = document_parser.get_date() - if date is None: + text = None + date = None + thumbnail = None + archive_path = None + page_count = None + + try: self._send_progress( - 90, + 20, 100, ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.PARSE_DATE, + ConsumerStatusShortMessage.PARSING_DOCUMENT, ) - with get_date_parser() as date_parser: - date = next(date_parser.parse(self.filename, text), None) - archive_path = document_parser.get_archive_path() - page_count = document_parser.get_page_count(self.working_copy, mime_type) + self.log.debug(f"Parsing {self.filename}...") - except ParseError as e: - _parser_cleanup(document_parser) - if tempdir: - tempdir.cleanup() - self._fail( - str(e), - f"Error occurred while consuming document {self.filename}: {e}", - exc_info=True, - exception=e, - ) - except Exception as e: - _parser_cleanup(document_parser) - if tempdir: - tempdir.cleanup() - self._fail( - str(e), - f"Unexpected error while consuming document {self.filename}: {e}", - exc_info=True, - exception=e, - ) + document_parser.parse(self.working_copy, mime_type) - # Prepare the document classifier. + self.log.debug(f"Generating thumbnail for {self.filename}...") + self._send_progress( + 70, + 100, + ProgressStatusOptions.WORKING, + ConsumerStatusShortMessage.GENERATING_THUMBNAIL, + ) + thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) - # TODO: I don't really like to do this here, but this way we avoid - # reloading the classifier multiple times, since there are multiple - # post-consume hooks that all require the classifier. - - classifier = load_classifier() - - self._send_progress( - 95, - 100, - ProgressStatusOptions.WORKING, - ConsumerStatusShortMessage.SAVE_DOCUMENT, - ) - # now that everything is done, we can start to store the document - # in the system. This will be a transaction and reasonably fast. - try: - with transaction.atomic(): - # store the document. 
- if self.input_doc.root_document_id: - # If this is a new version of an existing document, we need - # to make sure we're not creating a new document, but updating - # the existing one. - root_doc = Document.objects.get( - pk=self.input_doc.root_document_id, + text = document_parser.get_text() + date = document_parser.get_date() + if date is None: + self._send_progress( + 90, + 100, + ProgressStatusOptions.WORKING, + ConsumerStatusShortMessage.PARSE_DATE, ) - original_document = self._create_version_from_root( - root_doc, - text=text, - page_count=page_count, - mime_type=mime_type, - ) - actor = None + with get_date_parser() as date_parser: + date = next(date_parser.parse(self.filename, text), None) + archive_path = document_parser.get_archive_path() + page_count = document_parser.get_page_count( + self.working_copy, + mime_type, + ) - # Save the new version, potentially creating an audit log entry for the version addition if enabled. - if ( - settings.AUDIT_LOG_ENABLED - and self.metadata.actor_id is not None - ): - actor = User.objects.filter(pk=self.metadata.actor_id).first() - if actor is not None: - from auditlog.context import ( # type: ignore[import-untyped] - set_actor, - ) + except ParseError as e: + if tempdir: + tempdir.cleanup() + self._fail( + str(e), + f"Error occurred while consuming document {self.filename}: {e}", + exc_info=True, + exception=e, + ) + except Exception as e: + if tempdir: + tempdir.cleanup() + self._fail( + str(e), + f"Unexpected error while consuming document {self.filename}: {e}", + exc_info=True, + exception=e, + ) - with set_actor(actor): + # Prepare the document classifier. + + # TODO: I don't really like to do this here, but this way we avoid + # reloading the classifier multiple times, since there are multiple + # post-consume hooks that all require the classifier. 
+ + classifier = load_classifier() + + self._send_progress( + 95, + 100, + ProgressStatusOptions.WORKING, + ConsumerStatusShortMessage.SAVE_DOCUMENT, + ) + # now that everything is done, we can start to store the document + # in the system. This will be a transaction and reasonably fast. + try: + with transaction.atomic(): + # store the document. + if self.input_doc.root_document_id: + # If this is a new version of an existing document, we need + # to make sure we're not creating a new document, but updating + # the existing one. + root_doc = Document.objects.get( + pk=self.input_doc.root_document_id, + ) + original_document = self._create_version_from_root( + root_doc, + text=text, + page_count=page_count, + mime_type=mime_type, + ) + actor = None + + # Save the new version, potentially creating an audit log entry for the version addition if enabled. + if ( + settings.AUDIT_LOG_ENABLED + and self.metadata.actor_id is not None + ): + actor = User.objects.filter( + pk=self.metadata.actor_id, + ).first() + if actor is not None: + from auditlog.context import ( # type: ignore[import-untyped] + set_actor, + ) + + with set_actor(actor): + original_document.save() + else: original_document.save() else: original_document.save() + + # Create a log entry for the version addition, if enabled + if settings.AUDIT_LOG_ENABLED: + from auditlog.models import ( # type: ignore[import-untyped] + LogEntry, + ) + + LogEntry.objects.log_create( + instance=root_doc, + changes={ + "Version Added": ["None", original_document.id], + }, + action=LogEntry.Action.UPDATE, + actor=actor, + additional_data={ + "reason": "Version added", + "version_id": original_document.id, + }, + ) + document = original_document else: - original_document.save() - - # Create a log entry for the version addition, if enabled - if settings.AUDIT_LOG_ENABLED: - from auditlog.models import ( # type: ignore[import-untyped] - LogEntry, + document = self._store( + text=text, + date=date, + page_count=page_count, + 
mime_type=mime_type, ) - LogEntry.objects.log_create( - instance=root_doc, - changes={ - "Version Added": ["None", original_document.id], - }, - action=LogEntry.Action.UPDATE, - actor=actor, - additional_data={ - "reason": "Version added", - "version_id": original_document.id, - }, - ) - document = original_document - else: - document = self._store( - text=text, - date=date, - page_count=page_count, - mime_type=mime_type, - ) + # If we get here, it was successful. Proceed with post-consume + # hooks. If they fail, nothing will get changed. - # If we get here, it was successful. Proceed with post-consume - # hooks. If they fail, nothing will get changed. - - document_consumption_finished.send( - sender=self.__class__, - document=document, - logging_group=self.logging_group, - classifier=classifier, - original_file=self.unmodified_original - if self.unmodified_original - else self.working_copy, - ) - - # After everything is in the database, copy the files into - # place. If this fails, we'll also rollback the transaction. 
- with FileLock(settings.MEDIA_LOCK): - generated_filename = generate_unique_filename(document) - if ( - len(str(generated_filename)) - > Document.MAX_STORED_FILENAME_LENGTH - ): - self.log.warning( - "Generated source filename exceeds db path limit, falling back to default naming", - ) - generated_filename = generate_filename( - document, - use_format=False, - ) - document.filename = generated_filename - create_source_path_directory(document.source_path) - - self._write( - self.unmodified_original - if self.unmodified_original is not None + document_consumption_finished.send( + sender=self.__class__, + document=document, + logging_group=self.logging_group, + classifier=classifier, + original_file=self.unmodified_original + if self.unmodified_original else self.working_copy, - document.source_path, ) - self._write( - thumbnail, - document.thumbnail_path, - ) - - if archive_path and Path(archive_path).is_file(): - generated_archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) + # After everything is in the database, copy the files into + # place. If this fails, we'll also rollback the transaction. 
+ with FileLock(settings.MEDIA_LOCK): + generated_filename = generate_unique_filename(document) if ( - len(str(generated_archive_filename)) + len(str(generated_filename)) > Document.MAX_STORED_FILENAME_LENGTH ): self.log.warning( - "Generated archive filename exceeds db path limit, falling back to default naming", + "Generated source filename exceeds db path limit, falling back to default naming", ) - generated_archive_filename = generate_filename( + generated_filename = generate_filename( document, - archive_filename=True, use_format=False, ) - document.archive_filename = generated_archive_filename - create_source_path_directory(document.archive_path) + document.filename = generated_filename + create_source_path_directory(document.source_path) + self._write( - archive_path, - document.archive_path, + self.unmodified_original + if self.unmodified_original is not None + else self.working_copy, + document.source_path, ) - with Path(archive_path).open("rb") as f: - document.archive_checksum = hashlib.md5( - f.read(), - ).hexdigest() + self._write( + thumbnail, + document.thumbnail_path, + ) - # Don't save with the lock active. Saving will cause the file - # renaming logic to acquire the lock as well. 
- # This triggers things like file renaming - document.save() + if archive_path and Path(archive_path).is_file(): + generated_archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + if ( + len(str(generated_archive_filename)) + > Document.MAX_STORED_FILENAME_LENGTH + ): + self.log.warning( + "Generated archive filename exceeds db path limit, falling back to default naming", + ) + generated_archive_filename = generate_filename( + document, + archive_filename=True, + use_format=False, + ) + document.archive_filename = generated_archive_filename + create_source_path_directory(document.archive_path) + self._write( + archive_path, + document.archive_path, + ) - if document.root_document_id: - document_updated.send( - sender=self.__class__, - document=document.root_document, - ) + with Path(archive_path).open("rb") as f: + document.archive_checksum = hashlib.md5( + f.read(), + ).hexdigest() - # Delete the file only if it was successfully consumed - self.log.debug(f"Deleting original file {self.input_doc.original_file}") - self.input_doc.original_file.unlink() - self.log.debug(f"Deleting working copy {self.working_copy}") - self.working_copy.unlink() - if self.unmodified_original is not None: # pragma: no cover + # Don't save with the lock active. Saving will cause the file + # renaming logic to acquire the lock as well. 
+ # This triggers things like file renaming + document.save() + + if document.root_document_id: + document_updated.send( + sender=self.__class__, + document=document.root_document, + ) + + # Delete the file only if it was successfully consumed self.log.debug( - f"Deleting unmodified original file {self.unmodified_original}", + f"Deleting original file {self.input_doc.original_file}", ) - self.unmodified_original.unlink() + self.input_doc.original_file.unlink() + self.log.debug(f"Deleting working copy {self.working_copy}") + self.working_copy.unlink() + if self.unmodified_original is not None: # pragma: no cover + self.log.debug( + f"Deleting unmodified original file {self.unmodified_original}", + ) + self.unmodified_original.unlink() - # https://github.com/jonaswinkler/paperless-ng/discussions/1037 - shadow_file = ( - Path(self.input_doc.original_file).parent - / f"._{Path(self.input_doc.original_file).name}" + # https://github.com/jonaswinkler/paperless-ng/discussions/1037 + shadow_file = ( + Path(self.input_doc.original_file).parent + / f"._{Path(self.input_doc.original_file).name}" + ) + + if Path(shadow_file).is_file(): + self.log.debug(f"Deleting shadow file {shadow_file}") + Path(shadow_file).unlink() + + except Exception as e: + self._fail( + str(e), + f"The following error occurred while storing document " + f"{self.filename} after parsing: {e}", + exc_info=True, + exception=e, ) - - if Path(shadow_file).is_file(): - self.log.debug(f"Deleting shadow file {shadow_file}") - Path(shadow_file).unlink() - - except Exception as e: - self._fail( - str(e), - f"The following error occurred while storing document " - f"{self.filename} after parsing: {e}", - exc_info=True, - exception=e, - ) - finally: - _parser_cleanup(document_parser) - tempdir.cleanup() + finally: + tempdir.cleanup() self.run_post_consume_script(document) diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py index 
1756f8754..3d779ae18 100644 --- a/src/documents/management/commands/document_thumbnails.py +++ b/src/documents/management/commands/document_thumbnails.py @@ -3,19 +3,18 @@ import shutil from documents.management.commands.base import PaperlessCommand from documents.models import Document -from documents.parsers import get_parser_class_for_mime_type -from paperless.parsers.mail import MailDocumentParser -from paperless.parsers.remote import RemoteDocumentParser -from paperless.parsers.tesseract import RasterisedDocumentParser -from paperless.parsers.text import TextDocumentParser -from paperless.parsers.tika import TikaDocumentParser +from paperless.parsers.registry import get_parser_registry logger = logging.getLogger("paperless.management.thumbnails") def _process_document(doc_id: int) -> None: document: Document = Document.objects.get(id=doc_id) - parser_class = get_parser_class_for_mime_type(document.mime_type) + parser_class = get_parser_registry().get_parser_for_file( + document.mime_type, + document.original_filename or "", + document.source_path, + ) if parser_class is None: logger.warning( @@ -25,40 +24,9 @@ def _process_document(doc_id: int) -> None: ) return - parser = parser_class(logging_group=None) - - parser_is_new_style = isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ) - - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - parser.__enter__() - - try: - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - thumb = parser.get_thumbnail(document.source_path, document.mime_type) - else: - thumb = parser.get_thumbnail( - document.source_path, - document.mime_type, - document.get_public_filename(), - ) + with parser_class() as parser: + thumb = parser.get_thumbnail(document.source_path, document.mime_type) shutil.move(thumb, 
document.thumbnail_path) - finally: - # TODO(stumpylog): Cleanup once all parsers are handled - if parser_is_new_style: - parser.__exit__(None, None, None) - else: - parser.cleanup() class Command(PaperlessCommand): diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 372cf0491..69ee4e285 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -3,84 +3,47 @@ from __future__ import annotations import logging import mimetypes import os -import re import shutil import subprocess import tempfile -from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING from django.conf import settings from documents.loggers import LoggingMixin -from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.parsers.registry import get_parser_registry if TYPE_CHECKING: import datetime -# This regular expression will try to find dates in the document at -# hand and will match the following formats: -# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits -# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits -# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits -# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits -# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits -# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits -# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits -# - MONTH ZZZZ, with ZZZZ being 4 digits -# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits -# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters -# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits - -# TODO: isn't there a date parsing library for this? 
- -DATE_REGEX = re.compile( - r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))", - re.IGNORECASE, -) - - logger = logging.getLogger("paperless.parsing") -@lru_cache(maxsize=8) def is_mime_type_supported(mime_type: str) -> bool: """ Returns True if the mime type is supported, False otherwise """ - return get_parser_class_for_mime_type(mime_type) is not None + return get_parser_registry().get_parser_for_file(mime_type, "") is not None -@lru_cache(maxsize=8) def get_default_file_extension(mime_type: str) -> str: """ Returns the default file extension for a mimetype, or an empty string if it could not be determined """ - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - if mime_type in supported_mime_types: - return supported_mime_types[mime_type] + parser_class = get_parser_registry().get_parser_for_file(mime_type, "") + if parser_class is not None: + supported = parser_class.supported_mime_types() + if mime_type in supported: + return supported[mime_type] ext = mimetypes.guess_extension(mime_type) - if ext: - return ext - else: - return "" + return ext if ext else "" -@lru_cache(maxsize=8) def is_file_ext_supported(ext: str) -> bool: """ Returns True if the file extension is supported, False otherwise @@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool: def get_supported_file_extensions() -> set[str]: 
extensions = set() - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - for mime_type in supported_mime_types: + for parser_class in get_parser_registry().all_parsers(): + for mime_type, ext in parser_class.supported_mime_types().items(): extensions.update(mimetypes.guess_all_extensions(mime_type)) # Python's stdlib might be behind, so also add what the parser # says is the default extension # This makes image/webp supported on Python < 3.11 - extensions.add(supported_mime_types[mime_type]) + extensions.add(ext) return extensions -def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None: - """ - Returns the best parser (by weight) for the given mimetype or - None if no parser exists - """ - - options = [] - - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - if mime_type in supported_mime_types: - options.append(parser_declaration) - - if not options: - return None - - best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0] - - # Return the parser with the highest weight. 
- return best_parser["parser"] - - def run_convert( input_file, output_file, diff --git a/src/documents/signals/__init__.py b/src/documents/signals/__init__.py index fbb55d9fe..864fec09f 100644 --- a/src/documents/signals/__init__.py +++ b/src/documents/signals/__init__.py @@ -2,5 +2,4 @@ from django.dispatch import Signal document_consumption_started = Signal() document_consumption_finished = Signal() -document_consumer_declaration = Signal() document_updated = Signal() diff --git a/src/documents/tasks.py b/src/documents/tasks.py index a8ca0cc5f..751990c62 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -52,8 +52,6 @@ from documents.models import StoragePath from documents.models import Tag from documents.models import WorkflowRun from documents.models import WorkflowTrigger -from documents.parsers import DocumentParser -from documents.parsers import get_parser_class_for_mime_type from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import ProgressManager from documents.plugins.base import StopConsumeTaskError @@ -66,11 +64,7 @@ from documents.signals.handlers import send_websocket_document_updated from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig from paperless.parsers import ParserContext -from paperless.parsers.mail import MailDocumentParser -from paperless.parsers.remote import RemoteDocumentParser -from paperless.parsers.tesseract import RasterisedDocumentParser -from paperless.parsers.text import TextDocumentParser -from paperless.parsers.tika import TikaDocumentParser +from paperless.parsers.registry import get_parser_registry from paperless_ai.indexing import llm_index_add_or_update_document from paperless_ai.indexing import llm_index_remove_document from paperless_ai.indexing import update_llm_index @@ -310,8 +304,10 @@ def update_document_content_maybe_archive_file(document_id) -> None: mime_type = document.mime_type - parser_class: type[DocumentParser] | 
None = get_parser_class_for_mime_type( + parser_class = get_parser_registry().get_parser_for_file( mime_type, + document.original_filename or "", + document.source_path, ) if not parser_class: @@ -321,138 +317,92 @@ def update_document_content_maybe_archive_file(document_id) -> None: ) return - parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) + with parser_class() as parser: + parser.configure(ParserContext()) - parser_is_new_style = isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ) - - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - parser.__enter__() - - try: - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: - parser.configure(ParserContext()) + try: parser.parse(document.source_path, mime_type) - else: - parser.parse( - document.source_path, - mime_type, - document.get_public_filename(), - ) - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if parser_is_new_style: thumbnail = parser.get_thumbnail(document.source_path, mime_type) - else: - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) - with transaction.atomic(): - oldDocument = Document.objects.get(pk=document.pk) - if parser.get_archive_path(): - with Path(parser.get_archive_path()).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() - # I'm going to save first so that in case the file move - # fails, the database is rolled back. 
- # We also don't use save() since that triggers the filehandling - # logic, and we don't want that yet (file not yet in place) - document.archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) - Document.objects.filter(pk=document.pk).update( - archive_checksum=checksum, - content=parser.get_text(), - archive_filename=document.archive_filename, - ) - newDocument = Document.objects.get(pk=document.pk) - if settings.AUDIT_LOG_ENABLED: - LogEntry.objects.log_create( - instance=oldDocument, - changes={ - "content": [oldDocument.content, newDocument.content], - "archive_checksum": [ - oldDocument.archive_checksum, - newDocument.archive_checksum, - ], - "archive_filename": [ - oldDocument.archive_filename, - newDocument.archive_filename, - ], - }, - additional_data={ - "reason": "Update document content", - }, - action=LogEntry.Action.UPDATE, - ) - else: - Document.objects.filter(pk=document.pk).update( - content=parser.get_text(), - ) - - if settings.AUDIT_LOG_ENABLED: - LogEntry.objects.log_create( - instance=oldDocument, - changes={ - "content": [oldDocument.content, parser.get_text()], - }, - additional_data={ - "reason": "Update document content", - }, - action=LogEntry.Action.UPDATE, - ) - - with FileLock(settings.MEDIA_LOCK): + with transaction.atomic(): + oldDocument = Document.objects.get(pk=document.pk) if parser.get_archive_path(): - create_source_path_directory(document.archive_path) - shutil.move(parser.get_archive_path(), document.archive_path) - shutil.move(thumbnail, document.thumbnail_path) + with Path(parser.get_archive_path()).open("rb") as f: + checksum = hashlib.md5(f.read()).hexdigest() + # I'm going to save first so that in case the file move + # fails, the database is rolled back. 
+ # We also don't use save() since that triggers the filehandling + # logic, and we don't want that yet (file not yet in place) + document.archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + Document.objects.filter(pk=document.pk).update( + archive_checksum=checksum, + content=parser.get_text(), + archive_filename=document.archive_filename, + ) + newDocument = Document.objects.get(pk=document.pk) + if settings.AUDIT_LOG_ENABLED: + LogEntry.objects.log_create( + instance=oldDocument, + changes={ + "content": [oldDocument.content, newDocument.content], + "archive_checksum": [ + oldDocument.archive_checksum, + newDocument.archive_checksum, + ], + "archive_filename": [ + oldDocument.archive_filename, + newDocument.archive_filename, + ], + }, + additional_data={ + "reason": "Update document content", + }, + action=LogEntry.Action.UPDATE, + ) + else: + Document.objects.filter(pk=document.pk).update( + content=parser.get_text(), + ) - document.refresh_from_db() - logger.info( - f"Updating index for document {document_id} ({document.archive_checksum})", - ) - with index.open_index_writer() as writer: - index.update_document(writer, document) + if settings.AUDIT_LOG_ENABLED: + LogEntry.objects.log_create( + instance=oldDocument, + changes={ + "content": [oldDocument.content, parser.get_text()], + }, + additional_data={ + "reason": "Update document content", + }, + action=LogEntry.Action.UPDATE, + ) - ai_config = AIConfig() - if ai_config.llm_index_enabled: - llm_index_add_or_update_document(document) + with FileLock(settings.MEDIA_LOCK): + if parser.get_archive_path(): + create_source_path_directory(document.archive_path) + shutil.move(parser.get_archive_path(), document.archive_path) + shutil.move(thumbnail, document.thumbnail_path) - clear_document_caches(document.pk) + document.refresh_from_db() + logger.info( + f"Updating index for document {document_id} ({document.archive_checksum})", + ) + with index.open_index_writer() as writer: + 
index.update_document(writer, document) - except Exception: - logger.exception( - f"Error while parsing document {document} (ID: {document_id})", - ) - finally: - # TODO(stumpylog): Remove branch in the future when all parsers use new protocol - if isinstance( - parser, - ( - MailDocumentParser, - RasterisedDocumentParser, - RemoteDocumentParser, - TextDocumentParser, - TikaDocumentParser, - ), - ): - parser.__exit__(None, None, None) - else: - parser.cleanup() + ai_config = AIConfig() + if ai_config.llm_index_enabled: + llm_index_add_or_update_document(document) + + clear_document_caches(document.pk) + + except Exception: + logger.exception( + f"Error while parsing document {document} (ID: {document_id})", + ) @shared_task diff --git a/src/documents/tests/test_checks.py b/src/documents/tests/test_checks.py index b78946ba9..51d9cdddc 100644 --- a/src/documents/tests/test_checks.py +++ b/src/documents/tests/test_checks.py @@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase): def test_parser_check(self) -> None: self.assertEqual(parser_check(None), []) - with mock.patch("documents.checks.document_consumer_declaration.send") as m: - m.return_value = [] + with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn: + mock_registry = mock.MagicMock() + mock_registry.all_parsers.return_value = [] + mock_registry_fn.return_value = mock_registry self.assertEqual( parser_check(None), diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a3574fdce..df4c7d9c4 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -27,7 +27,6 @@ from documents.models import Document from documents.models import DocumentType from documents.models import StoragePath from documents.models import Tag -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.plugins.helpers import ProgressStatusOptions from documents.tasks import sanity_check @@ -38,62 
+37,106 @@ from documents.tests.utils import GetConsumerMixin from paperless_mail.models import MailRule -class _BaseTestParser(DocumentParser): - def get_settings(self) -> None: +class _BaseNewStyleParser: + """Minimal ParserProtocol implementation for use in consumer tests.""" + + name: str = "test-parser" + version: str = "0.1" + author: str = "test" + url: str = "test" + + @classmethod + def supported_mime_types(cls) -> dict: + return { + "application/pdf": ".pdf", + "image/png": ".png", + "message/rfc822": ".eml", + } + + @classmethod + def score(cls, mime_type: str, filename: str, path=None): + return 0 if mime_type in cls.supported_mime_types() else None + + @property + def can_produce_archive(self) -> bool: + return True + + @property + def requires_pdf_rendition(self) -> bool: + return False + + def __init__(self) -> None: + self._tmpdir: Path | None = None + self._text: str | None = None + self._archive: Path | None = None + self._thumb: Path | None = None + + def __enter__(self): + self._tmpdir = Path( + tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR), + ) + _, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir) + self._thumb = Path(thumb) + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + if self._tmpdir and self._tmpdir.exists(): + shutil.rmtree(self._tmpdir, ignore_errors=True) + + def configure(self, context) -> None: """ - This parser does not implement additional settings yet + Test parser doesn't do anything with context """ + + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: + raise NotImplementedError + + def get_text(self) -> str | None: + return self._text + + def get_date(self): return None + def get_archive_path(self): + return self._archive -class DummyParser(_BaseTestParser): - def __init__(self, logging_group, scratch_dir, archive_path) -> None: - super().__init__(logging_group, None) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", 
dir=scratch_dir) - self.archive_path = archive_path + def get_thumbnail(self, document_path, mime_type) -> Path: + return self._thumb - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb + def get_page_count(self, document_path, mime_type): + return None - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = "The Text" + def extract_metadata(self, document_path, mime_type) -> list: + return [] -class CopyParser(_BaseTestParser): - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb +class DummyParser(_BaseNewStyleParser): + _ARCHIVE_SRC = ( + Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf" + ) - def __init__(self, logging_group, progress_callback=None) -> None: - super().__init__(logging_group, progress_callback) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir) - - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = "The text" - self.archive_path = Path(self.tempdir / "archive.pdf") - shutil.copy(document_path, self.archive_path) + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: + self._text = "The Text" + if produce_archive and self._tmpdir: + self._archive = self._tmpdir / "archive.pdf" + shutil.copy(self._ARCHIVE_SRC, self._archive) -class FaultyParser(_BaseTestParser): - def __init__(self, logging_group, scratch_dir) -> None: - super().__init__(logging_group) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) +class CopyParser(_BaseNewStyleParser): + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: + self._text = "The text" + if produce_archive and self._tmpdir: + self._archive = self._tmpdir / "archive.pdf" + shutil.copy(document_path, self._archive) - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb - def parse(self, document_path, 
mime_type, file_name=None): +class FaultyParser(_BaseNewStyleParser): + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: raise ParseError("Does not compute.") -class FaultyGenericExceptionParser(_BaseTestParser): - def __init__(self, logging_group, scratch_dir) -> None: - super().__init__(logging_group) - _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) - - def get_thumbnail(self, document_path, mime_type, file_name=None): - return self.fake_thumb - - def parse(self, document_path, mime_type, file_name=None): +class FaultyGenericExceptionParser(_BaseNewStyleParser): + def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None: raise Exception("Generic exception.") @@ -147,38 +190,12 @@ class TestConsumer( self.assertEqual(payload["data"]["max_progress"], last_progress_max) self.assertEqual(payload["data"]["status"], last_status) - def make_dummy_parser(self, logging_group, progress_callback=None): - return DummyParser( - logging_group, - self.dirs.scratch_dir, - self.get_test_archive_file(), - ) - - def make_faulty_parser(self, logging_group, progress_callback=None): - return FaultyParser(logging_group, self.dirs.scratch_dir) - - def make_faulty_generic_exception_parser( - self, - logging_group, - progress_callback=None, - ): - return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir) - def setUp(self) -> None: super().setUp() - patcher = mock.patch("documents.parsers.document_consumer_declaration.send") - m = patcher.start() - m.return_value = [ - ( - None, - { - "parser": self.make_dummy_parser, - "mime_types": {"application/pdf": ".pdf"}, - "weight": 0, - }, - ), - ] + patcher = mock.patch("documents.consumer.get_parser_registry") + mock_registry = patcher.start() + mock_registry.return_value.get_parser_for_file.return_value = DummyParser self.addCleanup(patcher.stop) def get_test_file(self): @@ -547,9 +564,9 @@ class TestConsumer( ) as consumer: consumer.run() - 
@mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_registry") def testNoParsers(self, m) -> None: - m.return_value = [] + m.return_value.get_parser_for_file.return_value = None with self.assertRaisesMessage( ConsumerError, @@ -560,18 +577,9 @@ class TestConsumer( self._assert_first_last_send_progress(last_status="FAILED") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_registry") def testFaultyParser(self, m) -> None: - m.return_value = [ - ( - None, - { - "parser": self.make_faulty_parser, - "mime_types": {"application/pdf": ".pdf"}, - "weight": 0, - }, - ), - ] + m.return_value.get_parser_for_file.return_value = FaultyParser with self.get_consumer(self.get_test_file()) as consumer: with self.assertRaisesMessage( @@ -582,18 +590,9 @@ class TestConsumer( self._assert_first_last_send_progress(last_status="FAILED") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_registry") def testGenericParserException(self, m) -> None: - m.return_value = [ - ( - None, - { - "parser": self.make_faulty_generic_exception_parser, - "mime_types": {"application/pdf": ".pdf"}, - "weight": 0, - }, - ), - ] + m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser with self.get_consumer(self.get_test_file()) as consumer: with self.assertRaisesMessage( @@ -1017,7 +1016,7 @@ class TestConsumer( self._assert_first_last_send_progress() @override_settings(FILENAME_FORMAT="{title}") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_registry") def test_similar_filenames(self, m) -> None: shutil.copy( Path(__file__).parent / "samples" / "simple.pdf", @@ -1031,16 +1030,7 @@ class TestConsumer( Path(__file__).parent / "samples" / "simple-noalpha.png", settings.CONSUMPTION_DIR / "simple.png.pdf", ) - m.return_value = [ - ( 
- None, - { - "parser": CopyParser, - "mime_types": {"application/pdf": ".pdf", "image/png": ".png"}, - "weight": 0, - }, - ), - ] + m.return_value.get_parser_for_file.return_value = CopyParser with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer: consumer.run() @@ -1068,8 +1058,10 @@ class TestConsumer( sanity_check() + @mock.patch("documents.consumer.get_parser_registry") @mock.patch("documents.consumer.run_subprocess") - def test_try_to_clean_invalid_pdf(self, m) -> None: + def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None: + mock_registry.return_value.get_parser_for_file.return_value = None shutil.copy( Path(__file__).parent / "samples" / "invalid_pdf.pdf", settings.CONSUMPTION_DIR / "invalid_pdf.pdf", @@ -1091,10 +1083,10 @@ class TestConsumer( @mock.patch("paperless_mail.models.MailRule.objects.get") @mock.patch("paperless.parsers.mail.MailDocumentParser.parse") - @mock.patch("documents.parsers.document_consumer_declaration.send") + @mock.patch("documents.consumer.get_parser_registry") def test_mail_parser_receives_mailrule( self, - mock_consumer_declaration_send: mock.Mock, + mock_get_parser_registry: mock.Mock, mock_mail_parser_parse: mock.Mock, mock_mailrule_get: mock.Mock, ) -> None: @@ -1106,18 +1098,11 @@ class TestConsumer( THEN: - The mail parser should receive the mail rule """ - from paperless_mail.signals import get_parser as mail_get_parser + from paperless.parsers.mail import MailDocumentParser - mock_consumer_declaration_send.return_value = [ - ( - None, - { - "parser": mail_get_parser, - "mime_types": {"message/rfc822": ".eml"}, - "weight": 0, - }, - ), - ] + mock_get_parser_registry.return_value.get_parser_for_file.return_value = ( + MailDocumentParser + ) mock_mailrule_get.return_value = mock.Mock( pdf_layout=MailRule.PdfLayout.HTML_ONLY, ) diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 5ea1b361e..30963df70 100644 --- a/src/documents/tests/test_parsers.py +++ 
b/src/documents/tests/test_parsers.py @@ -1,132 +1,16 @@ -from tempfile import TemporaryDirectory -from unittest import mock - -from django.apps import apps from django.test import TestCase from django.test import override_settings from documents.parsers import get_default_file_extension -from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported +from paperless.parsers.registry import get_parser_registry +from paperless.parsers.registry import reset_parser_registry from paperless.parsers.tesseract import RasterisedDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser -class TestParserDiscovery(TestCase): - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_1_parser(self, m, *args) -> None: - """ - GIVEN: - - Parser declared for a given mimetype - WHEN: - - Attempt to get parser for the mimetype - THEN: - - Declared parser class is returned - """ - - class DummyParser: - pass - - m.return_value = ( - ( - None, - { - "weight": 0, - "parser": DummyParser, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ) - - self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser) - - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_n_parsers(self, m, *args) -> None: - """ - GIVEN: - - Two parsers declared for a given mimetype - - Second parser has a higher weight - WHEN: - - Attempt to get parser for the mimetype - THEN: - - Second parser class is returned - """ - - class DummyParser1: - pass - - class DummyParser2: - pass - - m.return_value = ( - ( - None, - { - "weight": 0, - "parser": DummyParser1, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ( - None, - { - "weight": 1, - "parser": DummyParser2, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ) - - 
self.assertEqual( - get_parser_class_for_mime_type("application/pdf"), - DummyParser2, - ) - - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_0_parsers(self, m, *args) -> None: - """ - GIVEN: - - No parsers are declared - WHEN: - - Attempt to get parser for the mimetype - THEN: - - No parser class is returned - """ - m.return_value = [] - with TemporaryDirectory(): - self.assertIsNone(get_parser_class_for_mime_type("application/pdf")) - - @mock.patch("documents.parsers.document_consumer_declaration.send") - def test_get_parser_class_no_valid_parser(self, m, *args) -> None: - """ - GIVEN: - - No parser declared for a given mimetype - - Parser declared for a different mimetype - WHEN: - - Attempt to get parser for the given mimetype - THEN: - - No parser class is returned - """ - - class DummyParser: - pass - - m.return_value = ( - ( - None, - { - "weight": 0, - "parser": DummyParser, - "mime_types": {"application/pdf": ".pdf"}, - }, - ), - ) - - self.assertIsNone(get_parser_class_for_mime_type("image/tiff")) - - class TestParserAvailability(TestCase): def test_tesseract_parser(self) -> None: """ @@ -151,7 +35,7 @@ class TestParserAvailability(TestCase): self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( - get_parser_class_for_mime_type(mime_type)(logging_group=None), + get_parser_registry().get_parser_for_file(mime_type, "")(), RasterisedDocumentParser, ) @@ -175,7 +59,7 @@ class TestParserAvailability(TestCase): self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( - get_parser_class_for_mime_type(mime_type)(logging_group=None), + get_parser_registry().get_parser_for_file(mime_type, "")(), TextDocumentParser, ) @@ -198,22 +82,23 @@ class TestParserAvailability(TestCase): ), ] - # Force the app ready to notice the settings override - with override_settings(TIKA_ENABLED=True, 
INSTALLED_APPS=["paperless_tika"]): - app = apps.get_app_config("paperless_tika") - app.ready() + self.addCleanup(reset_parser_registry) + + # Reset and rebuild the registry with Tika enabled. + with override_settings(TIKA_ENABLED=True): + reset_parser_registry() supported_exts = get_supported_file_extensions() - for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) - self.assertEqual(get_default_file_extension(mime_type), ext) - self.assertIsInstance( - get_parser_class_for_mime_type(mime_type)(logging_group=None), - TikaDocumentParser, - ) + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_registry().get_parser_for_file(mime_type, "")(), + TikaDocumentParser, + ) def test_no_parser_for_mime(self) -> None: - self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) + self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", "")) def test_default_extension(self) -> None: # Test no parser declared still returns a an extension diff --git a/src/documents/views.py b/src/documents/views.py index ffdc309fd..0716ce66d 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -7,7 +7,6 @@ import tempfile import zipfile from collections import defaultdict from collections import deque -from contextlib import nullcontext from datetime import datetime from pathlib import Path from time import mktime @@ -159,7 +158,6 @@ from documents.models import UiSettings from documents.models import Workflow from documents.models import WorkflowAction from documents.models import WorkflowTrigger -from documents.parsers import get_parser_class_for_mime_type from documents.permissions import AcknowledgeTasksPermissions from documents.permissions import PaperlessAdminPermissions from documents.permissions import PaperlessNotePermissions @@ -227,7 +225,7 @@ from paperless.celery import app as 
celery_app from paperless.config import AIConfig from paperless.config import GeneralConfig from paperless.models import ApplicationConfiguration -from paperless.parsers import ParserProtocol +from paperless.parsers.registry import get_parser_registry from paperless.serialisers import GroupSerializer from paperless.serialisers import UserSerializer from paperless.views import StandardPagination @@ -1084,17 +1082,17 @@ class DocumentViewSet( if not Path(file).is_file(): return None - parser_class = get_parser_class_for_mime_type(mime_type) + parser_class = get_parser_registry().get_parser_for_file( + mime_type, + Path(file).name, + Path(file), + ) if parser_class: - parser = parser_class(progress_callback=None, logging_group=None) - cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser) - try: - with cm: + with parser_class() as parser: return parser.extract_metadata(file, mime_type) except Exception: # pragma: no cover logger.exception(f"Issue getting metadata for {file}") - # TODO: cover GPG errors, remove later. 
return [] else: # pragma: no cover logger.warning(f"No parser for {mime_type}") diff --git a/src/paperless/checks.py b/src/paperless/checks.py index bcea6ef24..5f069b547 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -3,6 +3,7 @@ import os import pwd import shutil import stat +import subprocess from pathlib import Path from django.conf import settings @@ -299,3 +300,62 @@ def check_deprecated_db_settings( ) return warnings + + +@register() +def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]: + if settings.REMOTE_OCR_ENGINE == "azureai" and not ( + settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY + ): + return [ + Error( + "Azure AI remote parser requires endpoint and API key to be configured.", + ), + ] + + return [] + + +def get_tesseract_langs(): + proc = subprocess.run( + [shutil.which("tesseract"), "--list-langs"], + capture_output=True, + ) + + # Decode bytes to string, split on newlines, trim out the header + proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:] + + return [x.strip() for x in proc_lines] + + +@register() +def check_default_language_available(app_configs, **kwargs): + errs = [] + + if not settings.OCR_LANGUAGE: + errs.append( + Warning( + "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " + "This means that tesseract will fallback to english.", + ), + ) + return errs + + # binaries_check in paperless will check and report if this doesn't exist + # So skip trying to do anything here and let that handle missing binaries + if shutil.which("tesseract") is not None: + installed_langs = get_tesseract_langs() + + specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")] + + for lang in specified_langs: + if lang not in installed_langs: + errs.append( + Error( + f"The selected ocr language {lang} is " + f"not installed. Paperless cannot OCR your documents " + f"without it. 
Please fix PAPERLESS_OCR_LANGUAGE.", + ), + ) + + return errs diff --git a/src/paperless/parsers/registry.py b/src/paperless/parsers/registry.py index 7effe554f..c81fb1c45 100644 --- a/src/paperless/parsers/registry.py +++ b/src/paperless/parsers/registry.py @@ -33,6 +33,7 @@ name, version, author, url, supported_mime_types (callable), score (callable). from __future__ import annotations import logging +import threading from importlib.metadata import entry_points from typing import TYPE_CHECKING @@ -49,6 +50,7 @@ logger = logging.getLogger("paperless.parsers.registry") _registry: ParserRegistry | None = None _discovery_complete: bool = False +_lock = threading.Lock() # Attribute names that every registered external parser class must expose. _REQUIRED_ATTRS: tuple[str, ...] = ( @@ -74,7 +76,6 @@ def get_parser_registry() -> ParserRegistry: 1. Creates a new ParserRegistry. 2. Calls register_defaults to install built-in parsers. 3. Calls discover to load third-party plugins via importlib.metadata entrypoints. - 4. Calls log_summary to emit a startup summary. Subsequent calls return the same instance immediately. 
@@ -85,14 +86,15 @@ def get_parser_registry() -> ParserRegistry: """ global _registry, _discovery_complete - if _registry is None: - _registry = ParserRegistry() - _registry.register_defaults() + with _lock: + if _registry is None: + r = ParserRegistry() + r.register_defaults() + _registry = r - if not _discovery_complete: - _registry.discover() - _registry.log_summary() - _discovery_complete = True + if not _discovery_complete: + _registry.discover() + _discovery_complete = True return _registry @@ -113,9 +115,11 @@ def init_builtin_parsers() -> None: """ global _registry - if _registry is None: - _registry = ParserRegistry() - _registry.register_defaults() + with _lock: + if _registry is None: + r = ParserRegistry() + r.register_defaults() + _registry = r def reset_parser_registry() -> None: @@ -304,6 +308,23 @@ class ParserRegistry: getattr(cls, "url", "unknown"), ) + # ------------------------------------------------------------------ + # Inspection helpers + # ------------------------------------------------------------------ + + def all_parsers(self) -> list[type[ParserProtocol]]: + """Return all registered parser classes (external first, then builtins). + + Used by compatibility wrappers that need to iterate every parser to + compute the full set of supported MIME types and file extensions. + + Returns + ------- + list[type[ParserProtocol]] + External parsers followed by built-in parsers. + """ + return [*self._external, *self._builtins] + # ------------------------------------------------------------------ # Parser resolution # ------------------------------------------------------------------ @@ -334,7 +355,7 @@ class ParserRegistry: mime_type: The detected MIME type of the file. filename: - The original filename, including extension. + The original filename, including extension. May be empty in some cases path: Optional filesystem path to the file. Forwarded to each parser's score method. 
diff --git a/src/paperless/settings/__init__.py b/src/paperless/settings/__init__.py index 011f776b5..1c33db7c6 100644 --- a/src/paperless/settings/__init__.py +++ b/src/paperless/settings/__init__.py @@ -121,10 +121,7 @@ INSTALLED_APPS = [ "django_extensions", "paperless", "documents.apps.DocumentsConfig", - "paperless_tesseract.apps.PaperlessTesseractConfig", - "paperless_text.apps.PaperlessTextConfig", "paperless_mail.apps.PaperlessMailConfig", - "paperless_remote.apps.PaperlessRemoteParserConfig", "django.contrib.admin", "rest_framework", "rest_framework.authtoken", @@ -974,8 +971,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv( "http://localhost:3000", ) -if TIKA_ENABLED: - INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") +# Tika parser is now integrated into the main parser registry +# No separate Django app needed AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true") if AUDIT_LOG_ENABLED: diff --git a/src/paperless/tests/parsers/conftest.py b/src/paperless/tests/parsers/conftest.py index a484f02c8..8747ac9bd 100644 --- a/src/paperless/tests/parsers/conftest.py +++ b/src/paperless/tests/parsers/conftest.py @@ -90,35 +90,6 @@ def text_parser() -> Generator[TextDocumentParser, None, None]: yield parser -# ------------------------------------------------------------------ -# Remote parser sample files -# ------------------------------------------------------------------ - - -@pytest.fixture(scope="session") -def remote_samples_dir(samples_dir: Path) -> Path: - """Absolute path to the remote parser sample files directory. - - Returns - ------- - Path - ``/remote/`` - """ - return samples_dir / "remote" - - -@pytest.fixture(scope="session") -def sample_pdf_file(remote_samples_dir: Path) -> Path: - """Path to a simple digital PDF sample file. - - Returns - ------- - Path - Absolute path to ``remote/simple-digital.pdf``. 
- """ - return remote_samples_dir / "simple-digital.pdf" - - # ------------------------------------------------------------------ # Remote parser instance # ------------------------------------------------------------------ diff --git a/src/paperless/tests/parsers/test_remote_parser.py b/src/paperless/tests/parsers/test_remote_parser.py index 69199a6e8..892915bb5 100644 --- a/src/paperless/tests/parsers/test_remote_parser.py +++ b/src/paperless/tests/parsers/test_remote_parser.py @@ -277,20 +277,20 @@ class TestRemoteParserParse: def test_parse_returns_text_from_azure( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, azure_client: Mock, ) -> None: - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") assert remote_parser.get_text() == _DEFAULT_TEXT def test_parse_sets_archive_path( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, azure_client: Mock, ) -> None: - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") archive = remote_parser.get_archive_path() assert archive is not None @@ -300,11 +300,11 @@ class TestRemoteParserParse: def test_parse_closes_client_on_success( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, azure_client: Mock, ) -> None: remote_parser.configure(ParserContext()) - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") azure_client.close.assert_called_once() @@ -312,9 +312,9 @@ class TestRemoteParserParse: def test_parse_sets_empty_text_when_not_configured( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, 
"application/pdf") assert remote_parser.get_text() == "" assert remote_parser.get_archive_path() is None @@ -328,10 +328,10 @@ class TestRemoteParserParse: def test_get_date_always_none( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, azure_client: Mock, ) -> None: - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") assert remote_parser.get_date() is None @@ -345,33 +345,33 @@ class TestRemoteParserParseError: def test_parse_returns_none_on_azure_error( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, failing_azure_client: Mock, ) -> None: - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") assert remote_parser.get_text() is None def test_parse_closes_client_on_error( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, failing_azure_client: Mock, ) -> None: - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") failing_azure_client.close.assert_called_once() def test_parse_logs_error_on_azure_failure( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, failing_azure_client: Mock, mocker: MockerFixture, ) -> None: mock_log = mocker.patch("paperless.parsers.remote.logger") - remote_parser.parse(sample_pdf_file, "application/pdf") + remote_parser.parse(simple_digital_pdf_file, "application/pdf") mock_log.error.assert_called_once() assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0] @@ -386,18 +386,18 @@ class TestRemoteParserPageCount: def test_page_count_for_pdf( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - count = remote_parser.get_page_count(sample_pdf_file, 
"application/pdf") + count = remote_parser.get_page_count(simple_digital_pdf_file, "application/pdf") assert isinstance(count, int) assert count >= 1 def test_page_count_returns_none_for_image_mime( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - count = remote_parser.get_page_count(sample_pdf_file, "image/png") + count = remote_parser.get_page_count(simple_digital_pdf_file, "image/png") assert count is None def test_page_count_returns_none_for_invalid_pdf( @@ -420,25 +420,31 @@ class TestRemoteParserMetadata: def test_extract_metadata_non_pdf_returns_empty( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - result = remote_parser.extract_metadata(sample_pdf_file, "image/png") + result = remote_parser.extract_metadata(simple_digital_pdf_file, "image/png") assert result == [] def test_extract_metadata_pdf_returns_list( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf") + result = remote_parser.extract_metadata( + simple_digital_pdf_file, + "application/pdf", + ) assert isinstance(result, list) def test_extract_metadata_pdf_entries_have_required_keys( self, remote_parser: RemoteDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf") + result = remote_parser.extract_metadata( + simple_digital_pdf_file, + "application/pdf", + ) for entry in result: assert "namespace" in entry assert "prefix" in entry diff --git a/src/paperless/tests/parsers/test_tika_parser.py b/src/paperless/tests/parsers/test_tika_parser.py index 010969259..560527934 100644 --- a/src/paperless/tests/parsers/test_tika_parser.py +++ b/src/paperless/tests/parsers/test_tika_parser.py @@ -77,10 +77,10 @@ class 
TestTikaParserRegistryInterface: def test_get_page_count_returns_int_with_pdf_archive( self, tika_parser: TikaDocumentParser, - sample_pdf_file: Path, + simple_digital_pdf_file: Path, ) -> None: - tika_parser._archive_path = sample_pdf_file - count = tika_parser.get_page_count(sample_pdf_file, "application/pdf") + tika_parser._archive_path = simple_digital_pdf_file + count = tika_parser.get_page_count(simple_digital_pdf_file, "application/pdf") assert isinstance(count, int) assert count > 0 diff --git a/src/paperless/tests/samples/remote/simple-digital.pdf b/src/paperless/tests/samples/remote/simple-digital.pdf deleted file mode 100644 index e450de482..000000000 Binary files a/src/paperless/tests/samples/remote/simple-digital.pdf and /dev/null differ diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index 3572f02a4..87e64a90e 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -5,6 +5,7 @@ from pathlib import Path from unittest import mock import pytest +from django.core.checks import ERROR from django.core.checks import Error from django.core.checks import Warning from pytest_django.fixtures import SettingsWrapper @@ -12,7 +13,9 @@ from pytest_mock import MockerFixture from paperless.checks import audit_log_check from paperless.checks import binaries_check +from paperless.checks import check_default_language_available from paperless.checks import check_deprecated_db_settings +from paperless.checks import check_remote_parser_configured from paperless.checks import check_v3_minimum_upgrade_version from paperless.checks import debug_mode_check from paperless.checks import paths_check @@ -626,3 +629,116 @@ class TestV3MinimumUpgradeVersionCheck: conn.introspection.table_names.side_effect = OperationalError("DB unavailable") mocker.patch.dict("paperless.checks.connections", {"default": conn}) assert check_v3_minimum_upgrade_version(None) == [] + + +class TestRemoteParserChecks: + def 
test_no_engine(self, settings: SettingsWrapper) -> None: + settings.REMOTE_OCR_ENGINE = None + msgs = check_remote_parser_configured(None) + + assert len(msgs) == 0 + + def test_azure_no_endpoint(self, settings: SettingsWrapper) -> None: + + settings.REMOTE_OCR_ENGINE = "azureai" + settings.REMOTE_OCR_API_KEY = "somekey" + settings.REMOTE_OCR_ENDPOINT = None + + msgs = check_remote_parser_configured(None) + + assert len(msgs) == 1 + + msg = msgs[0] + + assert ( + "Azure AI remote parser requires endpoint and API key to be configured." + in msg.msg + ) + + +class TestTesseractChecks: + def test_default_language(self) -> None: + check_default_language_available(None) + + def test_no_language(self, settings: SettingsWrapper) -> None: + + settings.OCR_LANGUAGE = "" + + msgs = check_default_language_available(None) + + assert len(msgs) == 1 + msg = msgs[0] + + assert ( + "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" in msg.msg + ) + + def test_invalid_language( + self, + settings: SettingsWrapper, + mocker: MockerFixture, + ) -> None: + + settings.OCR_LANGUAGE = "ita" + + tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs") + tesser_lang_mock.return_value = ["deu", "eng"] + + msgs = check_default_language_available(None) + + assert len(msgs) == 1 + msg = msgs[0] + + assert msg.level == ERROR + assert "The selected ocr language ita is not installed" in msg.msg + + def test_multi_part_language( + self, + settings: SettingsWrapper, + mocker: MockerFixture, + ) -> None: + """ + GIVEN: + - An OCR language which is multi part (ie chi-sim) + - The language is correctly formatted + WHEN: + - Installed packages are checked + THEN: + - No errors are reported + """ + + settings.OCR_LANGUAGE = "chi_sim" + + tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs") + tesser_lang_mock.return_value = ["chi_sim", "eng"] + + msgs = check_default_language_available(None) + + assert len(msgs) == 0 + + def 
test_multi_part_language_bad_format( + self, + settings: SettingsWrapper, + mocker: MockerFixture, + ) -> None: + """ + GIVEN: + - An OCR language which is multi part (ie chi-sim) + - The language is NOT correctly formatted + WHEN: + - Installed packages are checked + THEN: + - An error is reported + """ + settings.OCR_LANGUAGE = "chi-sim" + + tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs") + tesser_lang_mock.return_value = ["chi_sim", "eng"] + + msgs = check_default_language_available(None) + + assert len(msgs) == 1 + msg = msgs[0] + + assert msg.level == ERROR + assert "The selected ocr language chi-sim is not installed" in msg.msg diff --git a/src/paperless_mail/apps.py b/src/paperless_mail/apps.py index dd3e71f82..1c5d656e0 100644 --- a/src/paperless_mail/apps.py +++ b/src/paperless_mail/apps.py @@ -1,18 +1,8 @@ from django.apps import AppConfig -from django.conf import settings from django.utils.translation import gettext_lazy as _ -from paperless_mail.signals import mail_consumer_declaration - class PaperlessMailConfig(AppConfig): name = "paperless_mail" verbose_name = _("Paperless mail") - - def ready(self) -> None: - from documents.signals import document_consumer_declaration - - if settings.TIKA_ENABLED: - document_consumer_declaration.connect(mail_consumer_declaration) - AppConfig.ready(self) diff --git a/src/paperless_mail/signals.py b/src/paperless_mail/signals.py deleted file mode 100644 index 8fe046393..000000000 --- a/src/paperless_mail/signals.py +++ /dev/null @@ -1,19 +0,0 @@ -def get_parser(*args, **kwargs): - from paperless.parsers.mail import MailDocumentParser - - # MailDocumentParser accepts no constructor args in the new-style protocol. - # Pop legacy args that arrive from the signal-based consumer path. - # Phase 4 will replace this signal path with the ParserRegistry.
- kwargs.pop("logging_group", None) - kwargs.pop("progress_callback", None) - return MailDocumentParser() - - -def mail_consumer_declaration(sender, **kwargs): - return { - "parser": get_parser, - "weight": 20, - "mime_types": { - "message/rfc822": ".eml", - }, - } diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py deleted file mode 100644 index 5380ea5ac..000000000 --- a/src/paperless_remote/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# this is here so that django finds the checks. -from paperless_remote.checks import check_remote_parser_configured - -__all__ = ["check_remote_parser_configured"] diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py deleted file mode 100644 index 1997b0ae9..000000000 --- a/src/paperless_remote/apps.py +++ /dev/null @@ -1,14 +0,0 @@ -from django.apps import AppConfig - -from paperless_remote.signals import remote_consumer_declaration - - -class PaperlessRemoteParserConfig(AppConfig): - name = "paperless_remote" - - def ready(self) -> None: - from documents.signals import document_consumer_declaration - - document_consumer_declaration.connect(remote_consumer_declaration) - - AppConfig.ready(self) diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py deleted file mode 100644 index b9abb0592..000000000 --- a/src/paperless_remote/checks.py +++ /dev/null @@ -1,17 +0,0 @@ -from django.conf import settings -from django.core.checks import Error -from django.core.checks import register - - -@register() -def check_remote_parser_configured(app_configs, **kwargs): - if settings.REMOTE_OCR_ENGINE == "azureai" and not ( - settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY - ): - return [ - Error( - "Azure AI remote parser requires endpoint and API key to be configured.", - ), - ] - - return [] diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py deleted file mode 100644 index 2300be760..000000000 --- a/src/paperless_remote/signals.py +++ 
/dev/null @@ -1,38 +0,0 @@ -from __future__ import annotations - -from typing import Any - - -def get_parser(*args: Any, **kwargs: Any) -> Any: - from paperless.parsers.remote import RemoteDocumentParser - - # The new RemoteDocumentParser does not accept the progress_callback - # kwarg injected by the old signal-based consumer. logging_group is - # forwarded as a positional arg. - # Phase 4 will replace this signal path with the new ParserRegistry. - kwargs.pop("progress_callback", None) - return RemoteDocumentParser(*args, **kwargs) - - -def get_supported_mime_types() -> dict[str, str]: - from django.conf import settings - - from paperless.parsers.remote import RemoteDocumentParser - from paperless.parsers.remote import RemoteEngineConfig - - config = RemoteEngineConfig( - engine=settings.REMOTE_OCR_ENGINE, - api_key=settings.REMOTE_OCR_API_KEY, - endpoint=settings.REMOTE_OCR_ENDPOINT, - ) - if not config.engine_is_valid(): - return {} - return RemoteDocumentParser.supported_mime_types() - - -def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]: - return { - "parser": get_parser, - "weight": 5, - "mime_types": get_supported_mime_types(), - } diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py deleted file mode 100644 index 0512fb257..000000000 --- a/src/paperless_remote/tests/test_checks.py +++ /dev/null @@ -1,24 +0,0 @@ -from unittest import TestCase - -from django.test import override_settings - -from paperless_remote import check_remote_parser_configured - - -class TestChecks(TestCase): - @override_settings(REMOTE_OCR_ENGINE=None) - def test_no_engine(self) -> None: - msgs = check_remote_parser_configured(None) - self.assertEqual(len(msgs), 0) - - @override_settings(REMOTE_OCR_ENGINE="azureai") - @override_settings(REMOTE_OCR_API_KEY="somekey") - 
@override_settings(REMOTE_OCR_ENDPOINT=None) - def test_azure_no_endpoint(self) -> None: - msgs = check_remote_parser_configured(None) - self.assertEqual(len(msgs), 1) - self.assertTrue( - msgs[0].msg.startswith( - "Azure AI remote parser requires endpoint and API key to be configured.", - ), - ) diff --git a/src/paperless_tesseract/__init__.py b/src/paperless_tesseract/__init__.py deleted file mode 100644 index cc0b886aa..000000000 --- a/src/paperless_tesseract/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# this is here so that django finds the checks. -from paperless_tesseract.checks import check_default_language_available -from paperless_tesseract.checks import get_tesseract_langs - -__all__ = ["check_default_language_available", "get_tesseract_langs"] diff --git a/src/paperless_tesseract/apps.py b/src/paperless_tesseract/apps.py deleted file mode 100644 index 8ade88400..000000000 --- a/src/paperless_tesseract/apps.py +++ /dev/null @@ -1,14 +0,0 @@ -from django.apps import AppConfig - -from paperless_tesseract.signals import tesseract_consumer_declaration - - -class PaperlessTesseractConfig(AppConfig): - name = "paperless_tesseract" - - def ready(self) -> None: - from documents.signals import document_consumer_declaration - - document_consumer_declaration.connect(tesseract_consumer_declaration) - - AppConfig.ready(self) diff --git a/src/paperless_tesseract/checks.py b/src/paperless_tesseract/checks.py deleted file mode 100644 index 0d7a1d90d..000000000 --- a/src/paperless_tesseract/checks.py +++ /dev/null @@ -1,52 +0,0 @@ -import shutil -import subprocess - -from django.conf import settings -from django.core.checks import Error -from django.core.checks import Warning -from django.core.checks import register - - -def get_tesseract_langs(): - proc = subprocess.run( - [shutil.which("tesseract"), "--list-langs"], - capture_output=True, - ) - - # Decode bytes to string, split on newlines, trim out the header - proc_lines = proc.stdout.decode("utf8", 
errors="ignore").strip().split("\n")[1:] - - return [x.strip() for x in proc_lines] - - -@register() -def check_default_language_available(app_configs, **kwargs): - errs = [] - - if not settings.OCR_LANGUAGE: - errs.append( - Warning( - "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " - "This means that tesseract will fallback to english.", - ), - ) - return errs - - # binaries_check in paperless will check and report if this doesn't exist - # So skip trying to do anything here and let that handle missing binaries - if shutil.which("tesseract") is not None: - installed_langs = get_tesseract_langs() - - specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")] - - for lang in specified_langs: - if lang not in installed_langs: - errs.append( - Error( - f"The selected ocr language {lang} is " - f"not installed. Paperless cannot OCR your documents " - f"without it. Please fix PAPERLESS_OCR_LANGUAGE.", - ), - ) - - return errs diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py deleted file mode 100644 index d80d13614..000000000 --- a/src/paperless_tesseract/signals.py +++ /dev/null @@ -1,34 +0,0 @@ -from __future__ import annotations - -from typing import Any - - -def get_parser(*args: Any, **kwargs: Any) -> Any: - from paperless.parsers.tesseract import RasterisedDocumentParser - - # RasterisedDocumentParser accepts logging_group for constructor compatibility but - # does not store or use it (no legacy DocumentParser base class). - # progress_callback is also not used. Both may arrive as a positional arg - # (consumer) or a keyword arg (views); *args absorbs the positional form, - # kwargs.pop handles the keyword form. Phase 4 will replace this signal - # path with the new ParserRegistry so the shim can be removed at that point. 
- kwargs.pop("logging_group", None) - kwargs.pop("progress_callback", None) - return RasterisedDocumentParser(*args, **kwargs) - - -def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]: - return { - "parser": get_parser, - "weight": 0, - "mime_types": { - "application/pdf": ".pdf", - "image/jpeg": ".jpg", - "image/png": ".png", - "image/tiff": ".tif", - "image/gif": ".gif", - "image/bmp": ".bmp", - "image/webp": ".webp", - "image/heic": ".heic", - }, - } diff --git a/src/paperless_tesseract/tests/__init__.py b/src/paperless_tesseract/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/paperless_tesseract/tests/test_checks.py b/src/paperless_tesseract/tests/test_checks.py deleted file mode 100644 index ab3ba0c16..000000000 --- a/src/paperless_tesseract/tests/test_checks.py +++ /dev/null @@ -1,67 +0,0 @@ -from unittest import mock - -from django.core.checks import ERROR -from django.test import TestCase -from django.test import override_settings - -from paperless_tesseract import check_default_language_available - - -class TestChecks(TestCase): - def test_default_language(self) -> None: - check_default_language_available(None) - - @override_settings(OCR_LANGUAGE="") - def test_no_language(self) -> None: - msgs = check_default_language_available(None) - self.assertEqual(len(msgs), 1) - self.assertTrue( - msgs[0].msg.startswith( - "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE", - ), - ) - - @override_settings(OCR_LANGUAGE="ita") - @mock.patch("paperless_tesseract.checks.get_tesseract_langs") - def test_invalid_language(self, m) -> None: - m.return_value = ["deu", "eng"] - msgs = check_default_language_available(None) - self.assertEqual(len(msgs), 1) - self.assertEqual(msgs[0].level, ERROR) - - @override_settings(OCR_LANGUAGE="chi_sim") - @mock.patch("paperless_tesseract.checks.get_tesseract_langs") - def test_multi_part_language(self, m) -> None: - """ - GIVEN: - - An OCR language which is 
multi part (ie chi-sim) - - The language is correctly formatted - WHEN: - - Installed packages are checked - THEN: - - No errors are reported - """ - m.return_value = ["chi_sim", "eng"] - - msgs = check_default_language_available(None) - - self.assertEqual(len(msgs), 0) - - @override_settings(OCR_LANGUAGE="chi-sim") - @mock.patch("paperless_tesseract.checks.get_tesseract_langs") - def test_multi_part_language_bad_format(self, m) -> None: - """ - GIVEN: - - An OCR language which is multi part (ie chi-sim) - - The language is correctly NOT formatted - WHEN: - - Installed packages are checked - THEN: - - No errors are reported - """ - m.return_value = ["chi_sim", "eng"] - - msgs = check_default_language_available(None) - - self.assertEqual(len(msgs), 1) - self.assertEqual(msgs[0].level, ERROR) diff --git a/src/paperless_text/__init__.py b/src/paperless_text/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/paperless_text/apps.py b/src/paperless_text/apps.py deleted file mode 100644 index 619d71886..000000000 --- a/src/paperless_text/apps.py +++ /dev/null @@ -1,14 +0,0 @@ -from django.apps import AppConfig - -from paperless_text.signals import text_consumer_declaration - - -class PaperlessTextConfig(AppConfig): - name = "paperless_text" - - def ready(self) -> None: - from documents.signals import document_consumer_declaration - - document_consumer_declaration.connect(text_consumer_declaration) - - AppConfig.ready(self) diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py deleted file mode 100644 index 916f0a7c0..000000000 --- a/src/paperless_text/signals.py +++ /dev/null @@ -1,29 +0,0 @@ -from __future__ import annotations - -from typing import Any - - -def get_parser(*args: Any, **kwargs: Any) -> Any: - from paperless.parsers.text import TextDocumentParser - - # TextDocumentParser accepts logging_group for constructor compatibility but - # does not store or use it (no legacy DocumentParser base class). 
- # progress_callback is also not used. Both may arrive as a positional arg - # (consumer) or a keyword arg (views); *args absorbs the positional form, - # kwargs.pop handles the keyword form. Phase 4 will replace this signal - # path with the new ParserRegistry so the shim can be removed at that point. - kwargs.pop("logging_group", None) - kwargs.pop("progress_callback", None) - return TextDocumentParser(*args, **kwargs) - - -def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]: - return { - "parser": get_parser, - "weight": 10, - "mime_types": { - "text/plain": ".txt", - "text/csv": ".csv", - "application/csv": ".csv", - }, - } diff --git a/src/paperless_text/tests/__init__.py b/src/paperless_text/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/paperless_tika/__init__.py b/src/paperless_tika/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py deleted file mode 100644 index 714a05188..000000000 --- a/src/paperless_tika/apps.py +++ /dev/null @@ -1,15 +0,0 @@ -from django.apps import AppConfig -from django.conf import settings - -from paperless_tika.signals import tika_consumer_declaration - - -class PaperlessTikaConfig(AppConfig): - name = "paperless_tika" - - def ready(self) -> None: - from documents.signals import document_consumer_declaration - - if settings.TIKA_ENABLED: - document_consumer_declaration.connect(tika_consumer_declaration) - AppConfig.ready(self) diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py deleted file mode 100644 index f1fd17ef6..000000000 --- a/src/paperless_tika/signals.py +++ /dev/null @@ -1,33 +0,0 @@ -def get_parser(*args, **kwargs): - from paperless.parsers.tika import TikaDocumentParser - - # TikaDocumentParser accepts logging_group for constructor compatibility but - # does not store or use it (no legacy DocumentParser base class). 
- # progress_callback is also not used. Both may arrive as a positional arg - # (consumer) or a keyword arg (views); *args absorbs the positional form, - # kwargs.pop handles the keyword form. Phase 4 will replace this signal - # path with the new ParserRegistry so the shim can be removed at that point. - kwargs.pop("logging_group", None) - kwargs.pop("progress_callback", None) - return TikaDocumentParser() - - -def tika_consumer_declaration(sender, **kwargs): - return { - "parser": get_parser, - "weight": 10, - "mime_types": { - "application/msword": ".doc", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", - "application/vnd.ms-excel": ".xls", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", - "application/vnd.ms-powerpoint": ".ppt", - "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", - "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx", - "application/vnd.oasis.opendocument.presentation": ".odp", - "application/vnd.oasis.opendocument.spreadsheet": ".ods", - "application/vnd.oasis.opendocument.text": ".odt", - "application/vnd.oasis.opendocument.graphics": ".odg", - "text/rtf": ".rtf", - }, - }