mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-12 01:25:23 +00:00
refactor: switch consumer and callers to ParserRegistry (Phase 4)
Replace all Django signal-based parser discovery with direct registry calls. Removes `_parser_cleanup`, `parser_is_new_style` shims, and all old-style isinstance checks. All parser instantiation now uses the `with parser_class() as parser:` context manager pattern. - documents/parsers.py: delegate to get_parser_registry(); drop lru_cache - documents/consumer.py: use registry + context manager; remove shims - documents/tasks.py: same pattern - documents/management/commands/document_thumbnails.py: same pattern - documents/views.py: get_metadata uses context manager - documents/checks.py: use get_parser_registry().all_parsers() - paperless/parsers/registry.py: add all_parsers() public method - tests: update mocks to target documents.consumer.get_parser_class_for_mime_type Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,25 +3,20 @@ from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from django.core.checks import register
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.templating.utils import convert_format_str_to_template_format
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
|
||||
@register()
|
||||
def parser_check(app_configs, **kwargs):
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
if len(parsers) == 0:
|
||||
if not get_parser_registry().all_parsers():
|
||||
return [
|
||||
Error(
|
||||
"No parsers found. This is a bug. The consumer won't be "
|
||||
"able to consume any documents without parsers.",
|
||||
),
|
||||
]
|
||||
else:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
@register()
|
||||
|
||||
+228
-296
@@ -32,7 +32,6 @@ from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import set_permissions_for_object
|
||||
@@ -52,40 +51,11 @@ from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers import ParserProtocol
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
|
||||
def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
"""
|
||||
Call cleanup on a parser, handling the new-style context-manager parsers.
|
||||
|
||||
New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
|
||||
instead of a cleanup() method. This shim will be removed once all existing parsers
|
||||
have switched to the new style and this consumer is updated to use it
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class WorkflowTriggerPlugin(
|
||||
NoCleanupPluginMixin,
|
||||
NoSetupPluginMixin,
|
||||
@@ -422,7 +392,7 @@ class ConsumerPlugin(
|
||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||
|
||||
# Based on the mime type, get the parser for that type
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
parser_class: type[ParserProtocol] | None = get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
)
|
||||
if not parser_class:
|
||||
@@ -446,313 +416,275 @@ class ConsumerPlugin(
|
||||
tempdir.cleanup()
|
||||
raise
|
||||
|
||||
def progress_callback(
|
||||
current_progress,
|
||||
max_progress,
|
||||
) -> None: # pragma: no cover
|
||||
# recalculate progress to be within 20 and 80
|
||||
p = int((current_progress / max_progress) * 50 + 20)
|
||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
||||
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser: DocumentParser = parser_class(
|
||||
self.logging_group,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
document_parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# New-style parsers use __enter__/__exit__ for resource management.
|
||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
if parser_is_new_style:
|
||||
document_parser.__enter__()
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
with parser_class() as document_parser:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
||||
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
)
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
90,
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
except ParseError as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
|
||||
# Prepare the document classifier.
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
self._send_progress(
|
||||
90,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(pk=self.metadata.actor_id).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
except ParseError as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(
|
||||
pk=self.metadata.actor_id,
|
||||
).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
use_format=False,
|
||||
)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
f"Deleting original file {self.input_doc.original_file}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
finally:
|
||||
_parser_cleanup(document_parser)
|
||||
tempdir.cleanup()
|
||||
finally:
|
||||
tempdir.cleanup()
|
||||
|
||||
self.run_post_consume_script(document)
|
||||
|
||||
|
||||
@@ -4,11 +4,6 @@ import shutil
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.models import Document
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
logger = logging.getLogger("paperless.management.thumbnails")
|
||||
|
||||
@@ -25,40 +20,9 @@ def _process_document(doc_id: int) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser = parser_class(logging_group=None)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
else:
|
||||
thumb = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
document.mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
with parser_class() as parser:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
shutil.move(thumb, document.thumbnail_path)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
if parser_is_new_style:
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
|
||||
+15
-41
@@ -7,20 +7,21 @@ import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
|
||||
from paperless.parsers import ParserProtocol
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
@@ -52,7 +53,6 @@ DATE_REGEX = re.compile(
|
||||
logger = logging.getLogger("paperless.parsing")
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_mime_type_supported(mime_type: str) -> bool:
|
||||
"""
|
||||
Returns True if the mime type is supported, False otherwise
|
||||
@@ -60,27 +60,21 @@ def is_mime_type_supported(mime_type: str) -> bool:
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def get_default_file_extension(mime_type: str) -> str:
|
||||
"""
|
||||
Returns the default file extension for a mimetype, or
|
||||
an empty string if it could not be determined
|
||||
"""
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
return supported_mime_types[mime_type]
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
if parser_class is not None:
|
||||
supported = parser_class.supported_mime_types()
|
||||
if mime_type in supported:
|
||||
return supported[mime_type]
|
||||
|
||||
ext = mimetypes.guess_extension(mime_type)
|
||||
if ext:
|
||||
return ext
|
||||
else:
|
||||
return ""
|
||||
return ext if ext else ""
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_file_ext_supported(ext: str) -> bool:
|
||||
"""
|
||||
Returns True if the file extension is supported, False otherwise
|
||||
@@ -94,42 +88,22 @@ def is_file_ext_supported(ext: str) -> bool:
|
||||
|
||||
def get_supported_file_extensions() -> set[str]:
|
||||
extensions = set()
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
for mime_type in supported_mime_types:
|
||||
for parser_class in get_parser_registry().all_parsers():
|
||||
for mime_type, ext in parser_class.supported_mime_types().items():
|
||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||
# Python's stdlib might be behind, so also add what the parser
|
||||
# says is the default extension
|
||||
# This makes image/webp supported on Python < 3.11
|
||||
extensions.add(supported_mime_types[mime_type])
|
||||
extensions.add(ext)
|
||||
|
||||
return extensions
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> type[ParserProtocol] | None:
|
||||
"""
|
||||
Returns the best parser (by weight) for the given mimetype or
|
||||
None if no parser exists
|
||||
Returns the best parser for the given mimetype or None if no parser exists.
|
||||
"""
|
||||
|
||||
options = []
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
return None
|
||||
|
||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
||||
|
||||
# Return the parser with the highest weight.
|
||||
return best_parser["parser"]
|
||||
return get_parser_registry().get_parser_for_file(mime_type, "")
|
||||
|
||||
|
||||
def run_convert(
|
||||
|
||||
+77
-131
@@ -52,7 +52,6 @@ from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowRun
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import ProgressManager
|
||||
@@ -66,11 +65,6 @@ from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
@@ -310,9 +304,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
)
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
|
||||
if not parser_class:
|
||||
logger.error(
|
||||
@@ -321,138 +313,92 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
||||
with parser_class() as parser:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.configure(ParserContext())
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type)
|
||||
else:
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
else:
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
|
||||
|
||||
@shared_task
|
||||
|
||||
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
|
||||
def test_parser_check(self) -> None:
|
||||
self.assertEqual(parser_check(None), [])
|
||||
|
||||
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
|
||||
m.return_value = []
|
||||
with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
|
||||
mock_registry = mock.MagicMock()
|
||||
mock_registry.all_parsers.return_value = []
|
||||
mock_registry_fn.return_value = mock_registry
|
||||
|
||||
self.assertEqual(
|
||||
parser_check(None),
|
||||
|
||||
@@ -27,7 +27,6 @@ from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.tasks import sanity_check
|
||||
@@ -38,62 +37,104 @@ from documents.tests.utils import GetConsumerMixin
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
|
||||
class _BaseTestParser(DocumentParser):
|
||||
def get_settings(self) -> None:
|
||||
"""
|
||||
This parser does not implement additional settings yet
|
||||
"""
|
||||
class _BaseNewStyleParser:
|
||||
"""Minimal ParserProtocol implementation for use in consumer tests."""
|
||||
|
||||
name: str = "test-parser"
|
||||
version: str = "0.1"
|
||||
author: str = "test"
|
||||
url: str = "test"
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict:
|
||||
return {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def score(cls, mime_type: str, filename: str, path=None):
|
||||
return 0 if mime_type in cls.supported_mime_types() else None
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
return False
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._tmpdir: Path | None = None
|
||||
self._text: str | None = None
|
||||
self._archive: Path | None = None
|
||||
self._thumb: Path | None = None
|
||||
|
||||
def __enter__(self):
|
||||
self._tmpdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
_, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
|
||||
self._thumb = Path(thumb)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
if self._tmpdir and self._tmpdir.exists():
|
||||
shutil.rmtree(self._tmpdir, ignore_errors=True)
|
||||
|
||||
def configure(self, context) -> None:
|
||||
pass
|
||||
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
return None
|
||||
|
||||
def get_archive_path(self):
|
||||
return self._archive
|
||||
|
||||
class DummyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir, archive_path) -> None:
|
||||
super().__init__(logging_group, None)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
self.archive_path = archive_path
|
||||
def get_thumbnail(self, document_path, mime_type) -> Path:
|
||||
return self._thumb
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
return None
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The Text"
|
||||
def extract_metadata(self, document_path, mime_type) -> list:
|
||||
return []
|
||||
|
||||
|
||||
class CopyParser(_BaseTestParser):
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
class DummyParser(_BaseNewStyleParser):
|
||||
_ARCHIVE_SRC = (
|
||||
Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
|
||||
)
|
||||
|
||||
def __init__(self, logging_group, progress_callback=None) -> None:
|
||||
super().__init__(logging_group, progress_callback)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The text"
|
||||
self.archive_path = Path(self.tempdir / "archive.pdf")
|
||||
shutil.copy(document_path, self.archive_path)
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The Text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(self._ARCHIVE_SRC, self._archive)
|
||||
|
||||
|
||||
class FaultyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
class CopyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(document_path, self._archive)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
class FaultyGenericExceptionParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise Exception("Generic exception.")
|
||||
|
||||
|
||||
@@ -147,38 +188,12 @@ class TestConsumer(
|
||||
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
||||
self.assertEqual(payload["data"]["status"], last_status)
|
||||
|
||||
def make_dummy_parser(self, logging_group, progress_callback=None):
|
||||
return DummyParser(
|
||||
logging_group,
|
||||
self.dirs.scratch_dir,
|
||||
self.get_test_archive_file(),
|
||||
)
|
||||
|
||||
def make_faulty_parser(self, logging_group, progress_callback=None):
|
||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def make_faulty_generic_exception_parser(
|
||||
self,
|
||||
logging_group,
|
||||
progress_callback=None,
|
||||
):
|
||||
return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
|
||||
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
patcher = mock.patch("documents.consumer.get_parser_class_for_mime_type")
|
||||
m = patcher.start()
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_dummy_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value = DummyParser
|
||||
self.addCleanup(patcher.stop)
|
||||
|
||||
def get_test_file(self):
|
||||
@@ -547,9 +562,9 @@ class TestConsumer(
|
||||
) as consumer:
|
||||
consumer.run()
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_class_for_mime_type")
|
||||
def testNoParsers(self, m) -> None:
|
||||
m.return_value = []
|
||||
m.return_value = None
|
||||
|
||||
with self.assertRaisesMessage(
|
||||
ConsumerError,
|
||||
@@ -560,18 +575,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_class_for_mime_type")
|
||||
def testFaultyParser(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value = FaultyParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -582,18 +588,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_class_for_mime_type")
|
||||
def testGenericParserException(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_generic_exception_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value = FaultyGenericExceptionParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -1017,7 +1014,7 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_class_for_mime_type")
|
||||
def test_similar_filenames(self, m) -> None:
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "simple.pdf",
|
||||
@@ -1031,16 +1028,7 @@ class TestConsumer(
|
||||
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
||||
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
||||
)
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": CopyParser,
|
||||
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value = CopyParser
|
||||
|
||||
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
||||
consumer.run()
|
||||
@@ -1068,8 +1056,9 @@ class TestConsumer(
|
||||
|
||||
sanity_check()
|
||||
|
||||
@mock.patch("documents.consumer.get_parser_class_for_mime_type", return_value=None)
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
def test_try_to_clean_invalid_pdf(self, m) -> None:
|
||||
def test_try_to_clean_invalid_pdf(self, m, _) -> None:
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
||||
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
||||
@@ -1091,10 +1080,10 @@ class TestConsumer(
|
||||
|
||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_class_for_mime_type")
|
||||
def test_mail_parser_receives_mailrule(
|
||||
self,
|
||||
mock_consumer_declaration_send: mock.Mock,
|
||||
mock_get_parser_class: mock.Mock,
|
||||
mock_mail_parser_parse: mock.Mock,
|
||||
mock_mailrule_get: mock.Mock,
|
||||
) -> None:
|
||||
@@ -1106,18 +1095,9 @@ class TestConsumer(
|
||||
THEN:
|
||||
- The mail parser should receive the mail rule
|
||||
"""
|
||||
from paperless_mail.signals import get_parser as mail_get_parser
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
mock_consumer_declaration_send.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": mail_get_parser,
|
||||
"mime_types": {"message/rfc822": ".eml"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
mock_get_parser_class.return_value = MailDocumentParser
|
||||
mock_mailrule_get.return_value = mock.Mock(
|
||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.apps import apps
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
@@ -9,14 +7,26 @@ from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless.parsers.registry import reset_parser_registry
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_1_parser(self, m, *args) -> None:
|
||||
def _mock_registry(self, return_value):
|
||||
"""Return a context manager that patches get_parser_registry."""
|
||||
mock_registry = mock.MagicMock()
|
||||
mock_registry.get_parser_for_file.return_value = return_value
|
||||
mock_registry.all_parsers.return_value = (
|
||||
[return_value] if return_value is not None else []
|
||||
)
|
||||
return mock.patch(
|
||||
"documents.parsers.get_parser_registry",
|
||||
return_value=mock_registry,
|
||||
)
|
||||
|
||||
def test_get_parser_class_1_parser(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Parser declared for a given mimetype
|
||||
@@ -29,63 +39,33 @@ class TestParserDiscovery(TestCase):
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
with self._mock_registry(DummyParser):
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser,
|
||||
)
|
||||
|
||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_n_parsers(self, m, *args) -> None:
|
||||
def test_get_parser_class_n_parsers(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Two parsers declared for a given mimetype
|
||||
- Second parser has a higher weight
|
||||
- Two parsers declared for a given mimetype, second has higher score
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Second parser class is returned
|
||||
- Second (higher-score) parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser1:
|
||||
pass
|
||||
|
||||
class DummyParser2:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser1,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 1,
|
||||
"parser": DummyParser2,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
# The registry already handles scoring; get_parser_for_file returns the winner.
|
||||
with self._mock_registry(DummyParser2):
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser2,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser2,
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_0_parsers(self, m, *args) -> None:
|
||||
def test_get_parser_class_0_parsers(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parsers are declared
|
||||
@@ -94,37 +74,20 @@ class TestParserDiscovery(TestCase):
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
m.return_value = []
|
||||
with TemporaryDirectory():
|
||||
with self._mock_registry(None):
|
||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
|
||||
def test_get_parser_class_no_valid_parser(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parser declared for a given mimetype
|
||||
- Parser declared for a different mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the given mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||
with self._mock_registry(None):
|
||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
@@ -151,7 +114,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_class_for_mime_type(mime_type)(),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
|
||||
@@ -175,7 +138,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_class_for_mime_type(mime_type)(),
|
||||
TextDocumentParser,
|
||||
)
|
||||
|
||||
@@ -198,19 +161,20 @@ class TestParserAvailability(TestCase):
|
||||
),
|
||||
]
|
||||
|
||||
# Force the app ready to notice the settings override
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
self.addCleanup(reset_parser_registry)
|
||||
|
||||
# Reset and rebuild the registry with Tika enabled.
|
||||
with override_settings(TIKA_ENABLED=True):
|
||||
reset_parser_registry()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
|
||||
def test_no_parser_for_mime(self) -> None:
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
|
||||
@@ -7,7 +7,6 @@ import tempfile
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from collections import deque
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from time import mktime
|
||||
@@ -226,7 +225,6 @@ from paperless.celery import app as celery_app
|
||||
from paperless.config import AIConfig
|
||||
from paperless.config import GeneralConfig
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.serialisers import GroupSerializer
|
||||
from paperless.serialisers import UserSerializer
|
||||
from paperless.views import StandardPagination
|
||||
@@ -1085,15 +1083,11 @@ class DocumentViewSet(
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
if parser_class:
|
||||
parser = parser_class(progress_callback=None, logging_group=None)
|
||||
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
|
||||
|
||||
try:
|
||||
with cm:
|
||||
with parser_class() as parser:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception(f"Issue getting metadata for {file}")
|
||||
# TODO: cover GPG errors, remove later.
|
||||
return []
|
||||
else: # pragma: no cover
|
||||
logger.warning(f"No parser for {mime_type}")
|
||||
|
||||
@@ -304,6 +304,23 @@ class ParserRegistry:
|
||||
getattr(cls, "url", "unknown"),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Inspection helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def all_parsers(self) -> list[type[ParserProtocol]]:
|
||||
"""Return all registered parser classes (external first, then builtins).
|
||||
|
||||
Used by compatibility wrappers that need to iterate every parser to
|
||||
compute the full set of supported MIME types and file extensions.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[type[ParserProtocol]]
|
||||
External parsers followed by built-in parsers.
|
||||
"""
|
||||
return [*self._external, *self._builtins]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Parser resolution
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user