mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-21 00:15:57 +00:00
Compare commits
9 Commits
dev
...
feature-dr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c60003635 | ||
|
|
854406c118 | ||
|
|
eb3401725c | ||
|
|
36ce9218ec | ||
|
|
a806280c1b | ||
|
|
2c1690c891 | ||
|
|
6640968064 | ||
|
|
f86ddcf221 | ||
|
|
c094e91567 |
@@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non
|
|||||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
||||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None") [assignment]
|
|
||||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path" [arg-type]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None") [assignment]
|
|
||||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
|
|||||||
@@ -269,10 +269,6 @@ testpaths = [
|
|||||||
"src/documents/tests/",
|
"src/documents/tests/",
|
||||||
"src/paperless/tests/",
|
"src/paperless/tests/",
|
||||||
"src/paperless_mail/tests/",
|
"src/paperless_mail/tests/",
|
||||||
"src/paperless_tesseract/tests/",
|
|
||||||
"src/paperless_tika/tests",
|
|
||||||
"src/paperless_text/tests/",
|
|
||||||
"src/paperless_remote/tests/",
|
|
||||||
"src/paperless_ai/tests",
|
"src/paperless_ai/tests",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -3,25 +3,20 @@ from django.core.checks import Error
|
|||||||
from django.core.checks import Warning
|
from django.core.checks import Warning
|
||||||
from django.core.checks import register
|
from django.core.checks import register
|
||||||
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
from documents.templating.utils import convert_format_str_to_template_format
|
from documents.templating.utils import convert_format_str_to_template_format
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def parser_check(app_configs, **kwargs):
|
def parser_check(app_configs, **kwargs):
|
||||||
parsers = []
|
if not get_parser_registry().all_parsers():
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parsers.append(response[1])
|
|
||||||
|
|
||||||
if len(parsers) == 0:
|
|
||||||
return [
|
return [
|
||||||
Error(
|
Error(
|
||||||
"No parsers found. This is a bug. The consumer won't be "
|
"No parsers found. This is a bug. The consumer won't be "
|
||||||
"able to consume any documents without parsers.",
|
"able to consume any documents without parsers.",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
else:
|
return []
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
|
|||||||
@@ -32,9 +32,7 @@ from documents.models import DocumentType
|
|||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
from documents.plugins.base import AlwaysRunPluginMixin
|
from documents.plugins.base import AlwaysRunPluginMixin
|
||||||
from documents.plugins.base import ConsumeTaskPlugin
|
from documents.plugins.base import ConsumeTaskPlugin
|
||||||
@@ -52,40 +50,12 @@ from documents.utils import copy_basic_file_stats
|
|||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.parsers import ParserContext
|
from paperless.parsers import ParserContext
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers import ParserProtocol
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
|
||||||
from paperless.parsers.text import TextDocumentParser
|
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
|
||||||
|
|
||||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||||
|
|
||||||
|
|
||||||
def _parser_cleanup(parser: DocumentParser) -> None:
|
|
||||||
"""
|
|
||||||
Call cleanup on a parser, handling the new-style context-manager parsers.
|
|
||||||
|
|
||||||
New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
|
|
||||||
instead of a cleanup() method. This shim will be removed once all existing parsers
|
|
||||||
have switched to the new style and this consumer is updated to use it
|
|
||||||
|
|
||||||
TODO(stumpylog): Remove me in the future
|
|
||||||
"""
|
|
||||||
if isinstance(
|
|
||||||
parser,
|
|
||||||
(
|
|
||||||
MailDocumentParser,
|
|
||||||
RasterisedDocumentParser,
|
|
||||||
RemoteDocumentParser,
|
|
||||||
TextDocumentParser,
|
|
||||||
TikaDocumentParser,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
parser.__exit__(None, None, None)
|
|
||||||
else:
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
class WorkflowTriggerPlugin(
|
class WorkflowTriggerPlugin(
|
||||||
NoCleanupPluginMixin,
|
NoCleanupPluginMixin,
|
||||||
NoSetupPluginMixin,
|
NoSetupPluginMixin,
|
||||||
@@ -422,8 +392,12 @@ class ConsumerPlugin(
|
|||||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||||
|
|
||||||
# Based on the mime type, get the parser for that type
|
# Based on the mime type, get the parser for that type
|
||||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
parser_class: type[ParserProtocol] | None = (
|
||||||
mime_type,
|
get_parser_registry().get_parser_for_file(
|
||||||
|
mime_type,
|
||||||
|
self.filename,
|
||||||
|
self.working_copy,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
if not parser_class:
|
if not parser_class:
|
||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
@@ -446,313 +420,275 @@ class ConsumerPlugin(
|
|||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def progress_callback(
|
|
||||||
current_progress,
|
|
||||||
max_progress,
|
|
||||||
) -> None: # pragma: no cover
|
|
||||||
# recalculate progress to be within 20 and 80
|
|
||||||
p = int((current_progress / max_progress) * 50 + 20)
|
|
||||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
|
||||||
|
|
||||||
# This doesn't parse the document yet, but gives us a parser.
|
# This doesn't parse the document yet, but gives us a parser.
|
||||||
|
with parser_class() as document_parser:
|
||||||
document_parser: DocumentParser = parser_class(
|
document_parser.configure(
|
||||||
self.logging_group,
|
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||||
progress_callback=progress_callback,
|
|
||||||
)
|
|
||||||
|
|
||||||
parser_is_new_style = isinstance(
|
|
||||||
document_parser,
|
|
||||||
(
|
|
||||||
MailDocumentParser,
|
|
||||||
RasterisedDocumentParser,
|
|
||||||
RemoteDocumentParser,
|
|
||||||
TextDocumentParser,
|
|
||||||
TikaDocumentParser,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# New-style parsers use __enter__/__exit__ for resource management.
|
|
||||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
|
||||||
# TODO(stumpylog): Remove me in the future
|
|
||||||
if parser_is_new_style:
|
|
||||||
document_parser.__enter__()
|
|
||||||
|
|
||||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
|
||||||
|
|
||||||
# Parse the document. This may take some time.
|
|
||||||
|
|
||||||
text = None
|
|
||||||
date = None
|
|
||||||
thumbnail = None
|
|
||||||
archive_path = None
|
|
||||||
page_count = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
self._send_progress(
|
|
||||||
20,
|
|
||||||
100,
|
|
||||||
ProgressStatusOptions.WORKING,
|
|
||||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
|
||||||
)
|
)
|
||||||
self.log.debug(f"Parsing {self.filename}...")
|
|
||||||
|
|
||||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
|
||||||
if parser_is_new_style:
|
|
||||||
document_parser.configure(
|
|
||||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
|
||||||
)
|
|
||||||
# TODO(stumpylog): Remove me in the future
|
|
||||||
document_parser.parse(self.working_copy, mime_type)
|
|
||||||
else:
|
|
||||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
|
||||||
|
|
||||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
# Parse the document. This may take some time.
|
||||||
self._send_progress(
|
|
||||||
70,
|
|
||||||
100,
|
|
||||||
ProgressStatusOptions.WORKING,
|
|
||||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
|
||||||
)
|
|
||||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
|
||||||
if parser_is_new_style:
|
|
||||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
|
||||||
else:
|
|
||||||
thumbnail = document_parser.get_thumbnail(
|
|
||||||
self.working_copy,
|
|
||||||
mime_type,
|
|
||||||
self.filename,
|
|
||||||
)
|
|
||||||
|
|
||||||
text = document_parser.get_text()
|
text = None
|
||||||
date = document_parser.get_date()
|
date = None
|
||||||
if date is None:
|
thumbnail = None
|
||||||
|
archive_path = None
|
||||||
|
page_count = None
|
||||||
|
|
||||||
|
try:
|
||||||
self._send_progress(
|
self._send_progress(
|
||||||
90,
|
20,
|
||||||
100,
|
100,
|
||||||
ProgressStatusOptions.WORKING,
|
ProgressStatusOptions.WORKING,
|
||||||
ConsumerStatusShortMessage.PARSE_DATE,
|
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||||
)
|
)
|
||||||
with get_date_parser() as date_parser:
|
self.log.debug(f"Parsing {self.filename}...")
|
||||||
date = next(date_parser.parse(self.filename, text), None)
|
|
||||||
archive_path = document_parser.get_archive_path()
|
|
||||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
|
||||||
|
|
||||||
except ParseError as e:
|
document_parser.parse(self.working_copy, mime_type)
|
||||||
_parser_cleanup(document_parser)
|
|
||||||
if tempdir:
|
|
||||||
tempdir.cleanup()
|
|
||||||
self._fail(
|
|
||||||
str(e),
|
|
||||||
f"Error occurred while consuming document {self.filename}: {e}",
|
|
||||||
exc_info=True,
|
|
||||||
exception=e,
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
_parser_cleanup(document_parser)
|
|
||||||
if tempdir:
|
|
||||||
tempdir.cleanup()
|
|
||||||
self._fail(
|
|
||||||
str(e),
|
|
||||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
|
||||||
exc_info=True,
|
|
||||||
exception=e,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prepare the document classifier.
|
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||||
|
self._send_progress(
|
||||||
|
70,
|
||||||
|
100,
|
||||||
|
ProgressStatusOptions.WORKING,
|
||||||
|
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||||
|
)
|
||||||
|
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||||
|
|
||||||
# TODO: I don't really like to do this here, but this way we avoid
|
text = document_parser.get_text()
|
||||||
# reloading the classifier multiple times, since there are multiple
|
date = document_parser.get_date()
|
||||||
# post-consume hooks that all require the classifier.
|
if date is None:
|
||||||
|
self._send_progress(
|
||||||
classifier = load_classifier()
|
90,
|
||||||
|
100,
|
||||||
self._send_progress(
|
ProgressStatusOptions.WORKING,
|
||||||
95,
|
ConsumerStatusShortMessage.PARSE_DATE,
|
||||||
100,
|
|
||||||
ProgressStatusOptions.WORKING,
|
|
||||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
|
||||||
)
|
|
||||||
# now that everything is done, we can start to store the document
|
|
||||||
# in the system. This will be a transaction and reasonably fast.
|
|
||||||
try:
|
|
||||||
with transaction.atomic():
|
|
||||||
# store the document.
|
|
||||||
if self.input_doc.root_document_id:
|
|
||||||
# If this is a new version of an existing document, we need
|
|
||||||
# to make sure we're not creating a new document, but updating
|
|
||||||
# the existing one.
|
|
||||||
root_doc = Document.objects.get(
|
|
||||||
pk=self.input_doc.root_document_id,
|
|
||||||
)
|
)
|
||||||
original_document = self._create_version_from_root(
|
with get_date_parser() as date_parser:
|
||||||
root_doc,
|
date = next(date_parser.parse(self.filename, text), None)
|
||||||
text=text,
|
archive_path = document_parser.get_archive_path()
|
||||||
page_count=page_count,
|
page_count = document_parser.get_page_count(
|
||||||
mime_type=mime_type,
|
self.working_copy,
|
||||||
)
|
mime_type,
|
||||||
actor = None
|
)
|
||||||
|
|
||||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
except ParseError as e:
|
||||||
if (
|
if tempdir:
|
||||||
settings.AUDIT_LOG_ENABLED
|
tempdir.cleanup()
|
||||||
and self.metadata.actor_id is not None
|
self._fail(
|
||||||
):
|
str(e),
|
||||||
actor = User.objects.filter(pk=self.metadata.actor_id).first()
|
f"Error occurred while consuming document {self.filename}: {e}",
|
||||||
if actor is not None:
|
exc_info=True,
|
||||||
from auditlog.context import ( # type: ignore[import-untyped]
|
exception=e,
|
||||||
set_actor,
|
)
|
||||||
)
|
except Exception as e:
|
||||||
|
if tempdir:
|
||||||
|
tempdir.cleanup()
|
||||||
|
self._fail(
|
||||||
|
str(e),
|
||||||
|
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||||
|
exc_info=True,
|
||||||
|
exception=e,
|
||||||
|
)
|
||||||
|
|
||||||
with set_actor(actor):
|
# Prepare the document classifier.
|
||||||
|
|
||||||
|
# TODO: I don't really like to do this here, but this way we avoid
|
||||||
|
# reloading the classifier multiple times, since there are multiple
|
||||||
|
# post-consume hooks that all require the classifier.
|
||||||
|
|
||||||
|
classifier = load_classifier()
|
||||||
|
|
||||||
|
self._send_progress(
|
||||||
|
95,
|
||||||
|
100,
|
||||||
|
ProgressStatusOptions.WORKING,
|
||||||
|
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||||
|
)
|
||||||
|
# now that everything is done, we can start to store the document
|
||||||
|
# in the system. This will be a transaction and reasonably fast.
|
||||||
|
try:
|
||||||
|
with transaction.atomic():
|
||||||
|
# store the document.
|
||||||
|
if self.input_doc.root_document_id:
|
||||||
|
# If this is a new version of an existing document, we need
|
||||||
|
# to make sure we're not creating a new document, but updating
|
||||||
|
# the existing one.
|
||||||
|
root_doc = Document.objects.get(
|
||||||
|
pk=self.input_doc.root_document_id,
|
||||||
|
)
|
||||||
|
original_document = self._create_version_from_root(
|
||||||
|
root_doc,
|
||||||
|
text=text,
|
||||||
|
page_count=page_count,
|
||||||
|
mime_type=mime_type,
|
||||||
|
)
|
||||||
|
actor = None
|
||||||
|
|
||||||
|
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||||
|
if (
|
||||||
|
settings.AUDIT_LOG_ENABLED
|
||||||
|
and self.metadata.actor_id is not None
|
||||||
|
):
|
||||||
|
actor = User.objects.filter(
|
||||||
|
pk=self.metadata.actor_id,
|
||||||
|
).first()
|
||||||
|
if actor is not None:
|
||||||
|
from auditlog.context import ( # type: ignore[import-untyped]
|
||||||
|
set_actor,
|
||||||
|
)
|
||||||
|
|
||||||
|
with set_actor(actor):
|
||||||
|
original_document.save()
|
||||||
|
else:
|
||||||
original_document.save()
|
original_document.save()
|
||||||
else:
|
else:
|
||||||
original_document.save()
|
original_document.save()
|
||||||
|
|
||||||
|
# Create a log entry for the version addition, if enabled
|
||||||
|
if settings.AUDIT_LOG_ENABLED:
|
||||||
|
from auditlog.models import ( # type: ignore[import-untyped]
|
||||||
|
LogEntry,
|
||||||
|
)
|
||||||
|
|
||||||
|
LogEntry.objects.log_create(
|
||||||
|
instance=root_doc,
|
||||||
|
changes={
|
||||||
|
"Version Added": ["None", original_document.id],
|
||||||
|
},
|
||||||
|
action=LogEntry.Action.UPDATE,
|
||||||
|
actor=actor,
|
||||||
|
additional_data={
|
||||||
|
"reason": "Version added",
|
||||||
|
"version_id": original_document.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
document = original_document
|
||||||
else:
|
else:
|
||||||
original_document.save()
|
document = self._store(
|
||||||
|
text=text,
|
||||||
# Create a log entry for the version addition, if enabled
|
date=date,
|
||||||
if settings.AUDIT_LOG_ENABLED:
|
page_count=page_count,
|
||||||
from auditlog.models import ( # type: ignore[import-untyped]
|
mime_type=mime_type,
|
||||||
LogEntry,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
LogEntry.objects.log_create(
|
# If we get here, it was successful. Proceed with post-consume
|
||||||
instance=root_doc,
|
# hooks. If they fail, nothing will get changed.
|
||||||
changes={
|
|
||||||
"Version Added": ["None", original_document.id],
|
|
||||||
},
|
|
||||||
action=LogEntry.Action.UPDATE,
|
|
||||||
actor=actor,
|
|
||||||
additional_data={
|
|
||||||
"reason": "Version added",
|
|
||||||
"version_id": original_document.id,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
document = original_document
|
|
||||||
else:
|
|
||||||
document = self._store(
|
|
||||||
text=text,
|
|
||||||
date=date,
|
|
||||||
page_count=page_count,
|
|
||||||
mime_type=mime_type,
|
|
||||||
)
|
|
||||||
|
|
||||||
# If we get here, it was successful. Proceed with post-consume
|
document_consumption_finished.send(
|
||||||
# hooks. If they fail, nothing will get changed.
|
sender=self.__class__,
|
||||||
|
document=document,
|
||||||
document_consumption_finished.send(
|
logging_group=self.logging_group,
|
||||||
sender=self.__class__,
|
classifier=classifier,
|
||||||
document=document,
|
original_file=self.unmodified_original
|
||||||
logging_group=self.logging_group,
|
if self.unmodified_original
|
||||||
classifier=classifier,
|
|
||||||
original_file=self.unmodified_original
|
|
||||||
if self.unmodified_original
|
|
||||||
else self.working_copy,
|
|
||||||
)
|
|
||||||
|
|
||||||
# After everything is in the database, copy the files into
|
|
||||||
# place. If this fails, we'll also rollback the transaction.
|
|
||||||
with FileLock(settings.MEDIA_LOCK):
|
|
||||||
generated_filename = generate_unique_filename(document)
|
|
||||||
if (
|
|
||||||
len(str(generated_filename))
|
|
||||||
> Document.MAX_STORED_FILENAME_LENGTH
|
|
||||||
):
|
|
||||||
self.log.warning(
|
|
||||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
|
||||||
)
|
|
||||||
generated_filename = generate_filename(
|
|
||||||
document,
|
|
||||||
use_format=False,
|
|
||||||
)
|
|
||||||
document.filename = generated_filename
|
|
||||||
create_source_path_directory(document.source_path)
|
|
||||||
|
|
||||||
self._write(
|
|
||||||
self.unmodified_original
|
|
||||||
if self.unmodified_original is not None
|
|
||||||
else self.working_copy,
|
else self.working_copy,
|
||||||
document.source_path,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self._write(
|
# After everything is in the database, copy the files into
|
||||||
thumbnail,
|
# place. If this fails, we'll also rollback the transaction.
|
||||||
document.thumbnail_path,
|
with FileLock(settings.MEDIA_LOCK):
|
||||||
)
|
generated_filename = generate_unique_filename(document)
|
||||||
|
|
||||||
if archive_path and Path(archive_path).is_file():
|
|
||||||
generated_archive_filename = generate_unique_filename(
|
|
||||||
document,
|
|
||||||
archive_filename=True,
|
|
||||||
)
|
|
||||||
if (
|
if (
|
||||||
len(str(generated_archive_filename))
|
len(str(generated_filename))
|
||||||
> Document.MAX_STORED_FILENAME_LENGTH
|
> Document.MAX_STORED_FILENAME_LENGTH
|
||||||
):
|
):
|
||||||
self.log.warning(
|
self.log.warning(
|
||||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||||
)
|
)
|
||||||
generated_archive_filename = generate_filename(
|
generated_filename = generate_filename(
|
||||||
document,
|
document,
|
||||||
archive_filename=True,
|
|
||||||
use_format=False,
|
use_format=False,
|
||||||
)
|
)
|
||||||
document.archive_filename = generated_archive_filename
|
document.filename = generated_filename
|
||||||
create_source_path_directory(document.archive_path)
|
create_source_path_directory(document.source_path)
|
||||||
|
|
||||||
self._write(
|
self._write(
|
||||||
archive_path,
|
self.unmodified_original
|
||||||
document.archive_path,
|
if self.unmodified_original is not None
|
||||||
|
else self.working_copy,
|
||||||
|
document.source_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
with Path(archive_path).open("rb") as f:
|
self._write(
|
||||||
document.archive_checksum = hashlib.md5(
|
thumbnail,
|
||||||
f.read(),
|
document.thumbnail_path,
|
||||||
).hexdigest()
|
)
|
||||||
|
|
||||||
# Don't save with the lock active. Saving will cause the file
|
if archive_path and Path(archive_path).is_file():
|
||||||
# renaming logic to acquire the lock as well.
|
generated_archive_filename = generate_unique_filename(
|
||||||
# This triggers things like file renaming
|
document,
|
||||||
document.save()
|
archive_filename=True,
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
len(str(generated_archive_filename))
|
||||||
|
> Document.MAX_STORED_FILENAME_LENGTH
|
||||||
|
):
|
||||||
|
self.log.warning(
|
||||||
|
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||||
|
)
|
||||||
|
generated_archive_filename = generate_filename(
|
||||||
|
document,
|
||||||
|
archive_filename=True,
|
||||||
|
use_format=False,
|
||||||
|
)
|
||||||
|
document.archive_filename = generated_archive_filename
|
||||||
|
create_source_path_directory(document.archive_path)
|
||||||
|
self._write(
|
||||||
|
archive_path,
|
||||||
|
document.archive_path,
|
||||||
|
)
|
||||||
|
|
||||||
if document.root_document_id:
|
with Path(archive_path).open("rb") as f:
|
||||||
document_updated.send(
|
document.archive_checksum = hashlib.md5(
|
||||||
sender=self.__class__,
|
f.read(),
|
||||||
document=document.root_document,
|
).hexdigest()
|
||||||
)
|
|
||||||
|
|
||||||
# Delete the file only if it was successfully consumed
|
# Don't save with the lock active. Saving will cause the file
|
||||||
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
|
# renaming logic to acquire the lock as well.
|
||||||
self.input_doc.original_file.unlink()
|
# This triggers things like file renaming
|
||||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
document.save()
|
||||||
self.working_copy.unlink()
|
|
||||||
if self.unmodified_original is not None: # pragma: no cover
|
if document.root_document_id:
|
||||||
|
document_updated.send(
|
||||||
|
sender=self.__class__,
|
||||||
|
document=document.root_document,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Delete the file only if it was successfully consumed
|
||||||
self.log.debug(
|
self.log.debug(
|
||||||
f"Deleting unmodified original file {self.unmodified_original}",
|
f"Deleting original file {self.input_doc.original_file}",
|
||||||
)
|
)
|
||||||
self.unmodified_original.unlink()
|
self.input_doc.original_file.unlink()
|
||||||
|
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||||
|
self.working_copy.unlink()
|
||||||
|
if self.unmodified_original is not None: # pragma: no cover
|
||||||
|
self.log.debug(
|
||||||
|
f"Deleting unmodified original file {self.unmodified_original}",
|
||||||
|
)
|
||||||
|
self.unmodified_original.unlink()
|
||||||
|
|
||||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||||
shadow_file = (
|
shadow_file = (
|
||||||
Path(self.input_doc.original_file).parent
|
Path(self.input_doc.original_file).parent
|
||||||
/ f"._{Path(self.input_doc.original_file).name}"
|
/ f"._{Path(self.input_doc.original_file).name}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if Path(shadow_file).is_file():
|
||||||
|
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||||
|
Path(shadow_file).unlink()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self._fail(
|
||||||
|
str(e),
|
||||||
|
f"The following error occurred while storing document "
|
||||||
|
f"{self.filename} after parsing: {e}",
|
||||||
|
exc_info=True,
|
||||||
|
exception=e,
|
||||||
)
|
)
|
||||||
|
finally:
|
||||||
if Path(shadow_file).is_file():
|
tempdir.cleanup()
|
||||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
|
||||||
Path(shadow_file).unlink()
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self._fail(
|
|
||||||
str(e),
|
|
||||||
f"The following error occurred while storing document "
|
|
||||||
f"{self.filename} after parsing: {e}",
|
|
||||||
exc_info=True,
|
|
||||||
exception=e,
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
_parser_cleanup(document_parser)
|
|
||||||
tempdir.cleanup()
|
|
||||||
|
|
||||||
self.run_post_consume_script(document)
|
self.run_post_consume_script(document)
|
||||||
|
|
||||||
|
|||||||
@@ -3,19 +3,18 @@ import shutil
|
|||||||
|
|
||||||
from documents.management.commands.base import PaperlessCommand
|
from documents.management.commands.base import PaperlessCommand
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
|
||||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
|
||||||
from paperless.parsers.text import TextDocumentParser
|
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.management.thumbnails")
|
logger = logging.getLogger("paperless.management.thumbnails")
|
||||||
|
|
||||||
|
|
||||||
def _process_document(doc_id: int) -> None:
|
def _process_document(doc_id: int) -> None:
|
||||||
document: Document = Document.objects.get(id=doc_id)
|
document: Document = Document.objects.get(id=doc_id)
|
||||||
parser_class = get_parser_class_for_mime_type(document.mime_type)
|
parser_class = get_parser_registry().get_parser_for_file(
|
||||||
|
document.mime_type,
|
||||||
|
document.original_filename or "",
|
||||||
|
document.source_path,
|
||||||
|
)
|
||||||
|
|
||||||
if parser_class is None:
|
if parser_class is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@@ -25,40 +24,9 @@ def _process_document(doc_id: int) -> None:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
parser = parser_class(logging_group=None)
|
with parser_class() as parser:
|
||||||
|
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||||
parser_is_new_style = isinstance(
|
|
||||||
parser,
|
|
||||||
(
|
|
||||||
MailDocumentParser,
|
|
||||||
RasterisedDocumentParser,
|
|
||||||
RemoteDocumentParser,
|
|
||||||
TextDocumentParser,
|
|
||||||
TikaDocumentParser,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
|
||||||
if parser_is_new_style:
|
|
||||||
parser.__enter__()
|
|
||||||
|
|
||||||
try:
|
|
||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
|
||||||
if parser_is_new_style:
|
|
||||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
|
||||||
else:
|
|
||||||
thumb = parser.get_thumbnail(
|
|
||||||
document.source_path,
|
|
||||||
document.mime_type,
|
|
||||||
document.get_public_filename(),
|
|
||||||
)
|
|
||||||
shutil.move(thumb, document.thumbnail_path)
|
shutil.move(thumb, document.thumbnail_path)
|
||||||
finally:
|
|
||||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
|
||||||
if parser_is_new_style:
|
|
||||||
parser.__exit__(None, None, None)
|
|
||||||
else:
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
class Command(PaperlessCommand):
|
class Command(PaperlessCommand):
|
||||||
|
|||||||
@@ -3,84 +3,47 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from functools import lru_cache
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from documents.loggers import LoggingMixin
|
from documents.loggers import LoggingMixin
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
# This regular expression will try to find dates in the document at
|
|
||||||
# hand and will match the following formats:
|
|
||||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
|
||||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
|
||||||
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
|
|
||||||
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
|
|
||||||
|
|
||||||
# TODO: isn't there a date parsing library for this?
|
|
||||||
|
|
||||||
DATE_REGEX = re.compile(
|
|
||||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.parsing")
|
logger = logging.getLogger("paperless.parsing")
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def is_mime_type_supported(mime_type: str) -> bool:
|
def is_mime_type_supported(mime_type: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Returns True if the mime type is supported, False otherwise
|
Returns True if the mime type is supported, False otherwise
|
||||||
"""
|
"""
|
||||||
return get_parser_class_for_mime_type(mime_type) is not None
|
return get_parser_registry().get_parser_for_file(mime_type, "") is not None
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def get_default_file_extension(mime_type: str) -> str:
|
def get_default_file_extension(mime_type: str) -> str:
|
||||||
"""
|
"""
|
||||||
Returns the default file extension for a mimetype, or
|
Returns the default file extension for a mimetype, or
|
||||||
an empty string if it could not be determined
|
an empty string if it could not be determined
|
||||||
"""
|
"""
|
||||||
for response in document_consumer_declaration.send(None):
|
parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
|
||||||
parser_declaration = response[1]
|
if parser_class is not None:
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
supported = parser_class.supported_mime_types()
|
||||||
|
if mime_type in supported:
|
||||||
if mime_type in supported_mime_types:
|
return supported[mime_type]
|
||||||
return supported_mime_types[mime_type]
|
|
||||||
|
|
||||||
ext = mimetypes.guess_extension(mime_type)
|
ext = mimetypes.guess_extension(mime_type)
|
||||||
if ext:
|
return ext if ext else ""
|
||||||
return ext
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def is_file_ext_supported(ext: str) -> bool:
|
def is_file_ext_supported(ext: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Returns True if the file extension is supported, False otherwise
|
Returns True if the file extension is supported, False otherwise
|
||||||
@@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool:
|
|||||||
|
|
||||||
def get_supported_file_extensions() -> set[str]:
|
def get_supported_file_extensions() -> set[str]:
|
||||||
extensions = set()
|
extensions = set()
|
||||||
for response in document_consumer_declaration.send(None):
|
for parser_class in get_parser_registry().all_parsers():
|
||||||
parser_declaration = response[1]
|
for mime_type, ext in parser_class.supported_mime_types().items():
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
|
||||||
|
|
||||||
for mime_type in supported_mime_types:
|
|
||||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||||
# Python's stdlib might be behind, so also add what the parser
|
# Python's stdlib might be behind, so also add what the parser
|
||||||
# says is the default extension
|
# says is the default extension
|
||||||
# This makes image/webp supported on Python < 3.11
|
# This makes image/webp supported on Python < 3.11
|
||||||
extensions.add(supported_mime_types[mime_type])
|
extensions.add(ext)
|
||||||
|
|
||||||
return extensions
|
return extensions
|
||||||
|
|
||||||
|
|
||||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
|
||||||
"""
|
|
||||||
Returns the best parser (by weight) for the given mimetype or
|
|
||||||
None if no parser exists
|
|
||||||
"""
|
|
||||||
|
|
||||||
options = []
|
|
||||||
|
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parser_declaration = response[1]
|
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
|
||||||
|
|
||||||
if mime_type in supported_mime_types:
|
|
||||||
options.append(parser_declaration)
|
|
||||||
|
|
||||||
if not options:
|
|
||||||
return None
|
|
||||||
|
|
||||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
|
||||||
|
|
||||||
# Return the parser with the highest weight.
|
|
||||||
return best_parser["parser"]
|
|
||||||
|
|
||||||
|
|
||||||
def run_convert(
|
def run_convert(
|
||||||
input_file,
|
input_file,
|
||||||
output_file,
|
output_file,
|
||||||
|
|||||||
@@ -2,5 +2,4 @@ from django.dispatch import Signal
|
|||||||
|
|
||||||
document_consumption_started = Signal()
|
document_consumption_started = Signal()
|
||||||
document_consumption_finished = Signal()
|
document_consumption_finished = Signal()
|
||||||
document_consumer_declaration = Signal()
|
|
||||||
document_updated = Signal()
|
document_updated = Signal()
|
||||||
|
|||||||
@@ -52,8 +52,6 @@ from documents.models import StoragePath
|
|||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import WorkflowRun
|
from documents.models import WorkflowRun
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.plugins.base import ConsumeTaskPlugin
|
from documents.plugins.base import ConsumeTaskPlugin
|
||||||
from documents.plugins.base import ProgressManager
|
from documents.plugins.base import ProgressManager
|
||||||
from documents.plugins.base import StopConsumeTaskError
|
from documents.plugins.base import StopConsumeTaskError
|
||||||
@@ -66,11 +64,7 @@ from documents.signals.handlers import send_websocket_document_updated
|
|||||||
from documents.workflows.utils import get_workflows_for_trigger
|
from documents.workflows.utils import get_workflows_for_trigger
|
||||||
from paperless.config import AIConfig
|
from paperless.config import AIConfig
|
||||||
from paperless.parsers import ParserContext
|
from paperless.parsers import ParserContext
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
|
||||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
|
||||||
from paperless.parsers.text import TextDocumentParser
|
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
|
||||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||||
from paperless_ai.indexing import llm_index_remove_document
|
from paperless_ai.indexing import llm_index_remove_document
|
||||||
from paperless_ai.indexing import update_llm_index
|
from paperless_ai.indexing import update_llm_index
|
||||||
@@ -310,8 +304,10 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
|
|
||||||
mime_type = document.mime_type
|
mime_type = document.mime_type
|
||||||
|
|
||||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
parser_class = get_parser_registry().get_parser_for_file(
|
||||||
mime_type,
|
mime_type,
|
||||||
|
document.original_filename or "",
|
||||||
|
document.source_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not parser_class:
|
if not parser_class:
|
||||||
@@ -321,138 +317,92 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
with parser_class() as parser:
|
||||||
|
parser.configure(ParserContext())
|
||||||
|
|
||||||
parser_is_new_style = isinstance(
|
try:
|
||||||
parser,
|
|
||||||
(
|
|
||||||
MailDocumentParser,
|
|
||||||
RasterisedDocumentParser,
|
|
||||||
RemoteDocumentParser,
|
|
||||||
TextDocumentParser,
|
|
||||||
TikaDocumentParser,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
|
||||||
if parser_is_new_style:
|
|
||||||
parser.__enter__()
|
|
||||||
|
|
||||||
try:
|
|
||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
|
||||||
if parser_is_new_style:
|
|
||||||
parser.configure(ParserContext())
|
|
||||||
parser.parse(document.source_path, mime_type)
|
parser.parse(document.source_path, mime_type)
|
||||||
else:
|
|
||||||
parser.parse(
|
|
||||||
document.source_path,
|
|
||||||
mime_type,
|
|
||||||
document.get_public_filename(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
|
||||||
if parser_is_new_style:
|
|
||||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||||
else:
|
|
||||||
thumbnail = parser.get_thumbnail(
|
|
||||||
document.source_path,
|
|
||||||
mime_type,
|
|
||||||
document.get_public_filename(),
|
|
||||||
)
|
|
||||||
|
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
oldDocument = Document.objects.get(pk=document.pk)
|
oldDocument = Document.objects.get(pk=document.pk)
|
||||||
if parser.get_archive_path():
|
|
||||||
with Path(parser.get_archive_path()).open("rb") as f:
|
|
||||||
checksum = hashlib.md5(f.read()).hexdigest()
|
|
||||||
# I'm going to save first so that in case the file move
|
|
||||||
# fails, the database is rolled back.
|
|
||||||
# We also don't use save() since that triggers the filehandling
|
|
||||||
# logic, and we don't want that yet (file not yet in place)
|
|
||||||
document.archive_filename = generate_unique_filename(
|
|
||||||
document,
|
|
||||||
archive_filename=True,
|
|
||||||
)
|
|
||||||
Document.objects.filter(pk=document.pk).update(
|
|
||||||
archive_checksum=checksum,
|
|
||||||
content=parser.get_text(),
|
|
||||||
archive_filename=document.archive_filename,
|
|
||||||
)
|
|
||||||
newDocument = Document.objects.get(pk=document.pk)
|
|
||||||
if settings.AUDIT_LOG_ENABLED:
|
|
||||||
LogEntry.objects.log_create(
|
|
||||||
instance=oldDocument,
|
|
||||||
changes={
|
|
||||||
"content": [oldDocument.content, newDocument.content],
|
|
||||||
"archive_checksum": [
|
|
||||||
oldDocument.archive_checksum,
|
|
||||||
newDocument.archive_checksum,
|
|
||||||
],
|
|
||||||
"archive_filename": [
|
|
||||||
oldDocument.archive_filename,
|
|
||||||
newDocument.archive_filename,
|
|
||||||
],
|
|
||||||
},
|
|
||||||
additional_data={
|
|
||||||
"reason": "Update document content",
|
|
||||||
},
|
|
||||||
action=LogEntry.Action.UPDATE,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
Document.objects.filter(pk=document.pk).update(
|
|
||||||
content=parser.get_text(),
|
|
||||||
)
|
|
||||||
|
|
||||||
if settings.AUDIT_LOG_ENABLED:
|
|
||||||
LogEntry.objects.log_create(
|
|
||||||
instance=oldDocument,
|
|
||||||
changes={
|
|
||||||
"content": [oldDocument.content, parser.get_text()],
|
|
||||||
},
|
|
||||||
additional_data={
|
|
||||||
"reason": "Update document content",
|
|
||||||
},
|
|
||||||
action=LogEntry.Action.UPDATE,
|
|
||||||
)
|
|
||||||
|
|
||||||
with FileLock(settings.MEDIA_LOCK):
|
|
||||||
if parser.get_archive_path():
|
if parser.get_archive_path():
|
||||||
create_source_path_directory(document.archive_path)
|
with Path(parser.get_archive_path()).open("rb") as f:
|
||||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
checksum = hashlib.md5(f.read()).hexdigest()
|
||||||
shutil.move(thumbnail, document.thumbnail_path)
|
# I'm going to save first so that in case the file move
|
||||||
|
# fails, the database is rolled back.
|
||||||
|
# We also don't use save() since that triggers the filehandling
|
||||||
|
# logic, and we don't want that yet (file not yet in place)
|
||||||
|
document.archive_filename = generate_unique_filename(
|
||||||
|
document,
|
||||||
|
archive_filename=True,
|
||||||
|
)
|
||||||
|
Document.objects.filter(pk=document.pk).update(
|
||||||
|
archive_checksum=checksum,
|
||||||
|
content=parser.get_text(),
|
||||||
|
archive_filename=document.archive_filename,
|
||||||
|
)
|
||||||
|
newDocument = Document.objects.get(pk=document.pk)
|
||||||
|
if settings.AUDIT_LOG_ENABLED:
|
||||||
|
LogEntry.objects.log_create(
|
||||||
|
instance=oldDocument,
|
||||||
|
changes={
|
||||||
|
"content": [oldDocument.content, newDocument.content],
|
||||||
|
"archive_checksum": [
|
||||||
|
oldDocument.archive_checksum,
|
||||||
|
newDocument.archive_checksum,
|
||||||
|
],
|
||||||
|
"archive_filename": [
|
||||||
|
oldDocument.archive_filename,
|
||||||
|
newDocument.archive_filename,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
additional_data={
|
||||||
|
"reason": "Update document content",
|
||||||
|
},
|
||||||
|
action=LogEntry.Action.UPDATE,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
Document.objects.filter(pk=document.pk).update(
|
||||||
|
content=parser.get_text(),
|
||||||
|
)
|
||||||
|
|
||||||
document.refresh_from_db()
|
if settings.AUDIT_LOG_ENABLED:
|
||||||
logger.info(
|
LogEntry.objects.log_create(
|
||||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
instance=oldDocument,
|
||||||
)
|
changes={
|
||||||
with index.open_index_writer() as writer:
|
"content": [oldDocument.content, parser.get_text()],
|
||||||
index.update_document(writer, document)
|
},
|
||||||
|
additional_data={
|
||||||
|
"reason": "Update document content",
|
||||||
|
},
|
||||||
|
action=LogEntry.Action.UPDATE,
|
||||||
|
)
|
||||||
|
|
||||||
ai_config = AIConfig()
|
with FileLock(settings.MEDIA_LOCK):
|
||||||
if ai_config.llm_index_enabled:
|
if parser.get_archive_path():
|
||||||
llm_index_add_or_update_document(document)
|
create_source_path_directory(document.archive_path)
|
||||||
|
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||||
|
shutil.move(thumbnail, document.thumbnail_path)
|
||||||
|
|
||||||
clear_document_caches(document.pk)
|
document.refresh_from_db()
|
||||||
|
logger.info(
|
||||||
|
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||||
|
)
|
||||||
|
with index.open_index_writer() as writer:
|
||||||
|
index.update_document(writer, document)
|
||||||
|
|
||||||
except Exception:
|
ai_config = AIConfig()
|
||||||
logger.exception(
|
if ai_config.llm_index_enabled:
|
||||||
f"Error while parsing document {document} (ID: {document_id})",
|
llm_index_add_or_update_document(document)
|
||||||
)
|
|
||||||
finally:
|
clear_document_caches(document.pk)
|
||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
|
||||||
if isinstance(
|
except Exception:
|
||||||
parser,
|
logger.exception(
|
||||||
(
|
f"Error while parsing document {document} (ID: {document_id})",
|
||||||
MailDocumentParser,
|
)
|
||||||
RasterisedDocumentParser,
|
|
||||||
RemoteDocumentParser,
|
|
||||||
TextDocumentParser,
|
|
||||||
TikaDocumentParser,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
parser.__exit__(None, None, None)
|
|
||||||
else:
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
|
|||||||
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
|
|||||||
def test_parser_check(self) -> None:
|
def test_parser_check(self) -> None:
|
||||||
self.assertEqual(parser_check(None), [])
|
self.assertEqual(parser_check(None), [])
|
||||||
|
|
||||||
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
|
with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
|
||||||
m.return_value = []
|
mock_registry = mock.MagicMock()
|
||||||
|
mock_registry.all_parsers.return_value = []
|
||||||
|
mock_registry_fn.return_value = mock_registry
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
parser_check(None),
|
parser_check(None),
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ from documents.models import Document
|
|||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.plugins.helpers import ProgressStatusOptions
|
from documents.plugins.helpers import ProgressStatusOptions
|
||||||
from documents.tasks import sanity_check
|
from documents.tasks import sanity_check
|
||||||
@@ -38,62 +37,106 @@ from documents.tests.utils import GetConsumerMixin
|
|||||||
from paperless_mail.models import MailRule
|
from paperless_mail.models import MailRule
|
||||||
|
|
||||||
|
|
||||||
class _BaseTestParser(DocumentParser):
|
class _BaseNewStyleParser:
|
||||||
def get_settings(self) -> None:
|
"""Minimal ParserProtocol implementation for use in consumer tests."""
|
||||||
|
|
||||||
|
name: str = "test-parser"
|
||||||
|
version: str = "0.1"
|
||||||
|
author: str = "test"
|
||||||
|
url: str = "test"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict:
|
||||||
|
return {
|
||||||
|
"application/pdf": ".pdf",
|
||||||
|
"image/png": ".png",
|
||||||
|
"message/rfc822": ".eml",
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(cls, mime_type: str, filename: str, path=None):
|
||||||
|
return 0 if mime_type in cls.supported_mime_types() else None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._tmpdir: Path | None = None
|
||||||
|
self._text: str | None = None
|
||||||
|
self._archive: Path | None = None
|
||||||
|
self._thumb: Path | None = None
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
self._tmpdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
|
||||||
|
)
|
||||||
|
_, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
|
||||||
|
self._thumb = Path(thumb)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||||
|
if self._tmpdir and self._tmpdir.exists():
|
||||||
|
shutil.rmtree(self._tmpdir, ignore_errors=True)
|
||||||
|
|
||||||
|
def configure(self, context) -> None:
|
||||||
"""
|
"""
|
||||||
This parser does not implement additional settings yet
|
Test parser doesn't do anything with context
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self):
|
||||||
|
return self._archive
|
||||||
|
|
||||||
class DummyParser(_BaseTestParser):
|
def get_thumbnail(self, document_path, mime_type) -> Path:
|
||||||
def __init__(self, logging_group, scratch_dir, archive_path) -> None:
|
return self._thumb
|
||||||
super().__init__(logging_group, None)
|
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
|
||||||
self.archive_path = archive_path
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
def get_page_count(self, document_path, mime_type):
|
||||||
return self.fake_thumb
|
return None
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
def extract_metadata(self, document_path, mime_type) -> list:
|
||||||
self.text = "The Text"
|
return []
|
||||||
|
|
||||||
|
|
||||||
class CopyParser(_BaseTestParser):
|
class DummyParser(_BaseNewStyleParser):
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
_ARCHIVE_SRC = (
|
||||||
return self.fake_thumb
|
Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, logging_group, progress_callback=None) -> None:
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
super().__init__(logging_group, progress_callback)
|
self._text = "The Text"
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
|
if produce_archive and self._tmpdir:
|
||||||
|
self._archive = self._tmpdir / "archive.pdf"
|
||||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
shutil.copy(self._ARCHIVE_SRC, self._archive)
|
||||||
self.text = "The text"
|
|
||||||
self.archive_path = Path(self.tempdir / "archive.pdf")
|
|
||||||
shutil.copy(document_path, self.archive_path)
|
|
||||||
|
|
||||||
|
|
||||||
class FaultyParser(_BaseTestParser):
|
class CopyParser(_BaseNewStyleParser):
|
||||||
def __init__(self, logging_group, scratch_dir) -> None:
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
super().__init__(logging_group)
|
self._text = "The text"
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
if produce_archive and self._tmpdir:
|
||||||
|
self._archive = self._tmpdir / "archive.pdf"
|
||||||
|
shutil.copy(document_path, self._archive)
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
|
||||||
return self.fake_thumb
|
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
class FaultyParser(_BaseNewStyleParser):
|
||||||
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
raise ParseError("Does not compute.")
|
raise ParseError("Does not compute.")
|
||||||
|
|
||||||
|
|
||||||
class FaultyGenericExceptionParser(_BaseTestParser):
|
class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||||
def __init__(self, logging_group, scratch_dir) -> None:
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
super().__init__(logging_group)
|
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
|
||||||
return self.fake_thumb
|
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
|
||||||
raise Exception("Generic exception.")
|
raise Exception("Generic exception.")
|
||||||
|
|
||||||
|
|
||||||
@@ -147,38 +190,12 @@ class TestConsumer(
|
|||||||
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
||||||
self.assertEqual(payload["data"]["status"], last_status)
|
self.assertEqual(payload["data"]["status"], last_status)
|
||||||
|
|
||||||
def make_dummy_parser(self, logging_group, progress_callback=None):
|
|
||||||
return DummyParser(
|
|
||||||
logging_group,
|
|
||||||
self.dirs.scratch_dir,
|
|
||||||
self.get_test_archive_file(),
|
|
||||||
)
|
|
||||||
|
|
||||||
def make_faulty_parser(self, logging_group, progress_callback=None):
|
|
||||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
|
||||||
|
|
||||||
def make_faulty_generic_exception_parser(
|
|
||||||
self,
|
|
||||||
logging_group,
|
|
||||||
progress_callback=None,
|
|
||||||
):
|
|
||||||
return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
|
|
||||||
|
|
||||||
def setUp(self) -> None:
|
def setUp(self) -> None:
|
||||||
super().setUp()
|
super().setUp()
|
||||||
|
|
||||||
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
|
patcher = mock.patch("documents.consumer.get_parser_registry")
|
||||||
m = patcher.start()
|
mock_registry = patcher.start()
|
||||||
m.return_value = [
|
mock_registry.return_value.get_parser_for_file.return_value = DummyParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": self.make_dummy_parser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
self.addCleanup(patcher.stop)
|
self.addCleanup(patcher.stop)
|
||||||
|
|
||||||
def get_test_file(self):
|
def get_test_file(self):
|
||||||
@@ -547,9 +564,9 @@ class TestConsumer(
|
|||||||
) as consumer:
|
) as consumer:
|
||||||
consumer.run()
|
consumer.run()
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def testNoParsers(self, m) -> None:
|
def testNoParsers(self, m) -> None:
|
||||||
m.return_value = []
|
m.return_value.get_parser_for_file.return_value = None
|
||||||
|
|
||||||
with self.assertRaisesMessage(
|
with self.assertRaisesMessage(
|
||||||
ConsumerError,
|
ConsumerError,
|
||||||
@@ -560,18 +577,9 @@ class TestConsumer(
|
|||||||
|
|
||||||
self._assert_first_last_send_progress(last_status="FAILED")
|
self._assert_first_last_send_progress(last_status="FAILED")
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def testFaultyParser(self, m) -> None:
|
def testFaultyParser(self, m) -> None:
|
||||||
m.return_value = [
|
m.return_value.get_parser_for_file.return_value = FaultyParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": self.make_faulty_parser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
with self.get_consumer(self.get_test_file()) as consumer:
|
with self.get_consumer(self.get_test_file()) as consumer:
|
||||||
with self.assertRaisesMessage(
|
with self.assertRaisesMessage(
|
||||||
@@ -582,18 +590,9 @@ class TestConsumer(
|
|||||||
|
|
||||||
self._assert_first_last_send_progress(last_status="FAILED")
|
self._assert_first_last_send_progress(last_status="FAILED")
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def testGenericParserException(self, m) -> None:
|
def testGenericParserException(self, m) -> None:
|
||||||
m.return_value = [
|
m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": self.make_faulty_generic_exception_parser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
with self.get_consumer(self.get_test_file()) as consumer:
|
with self.get_consumer(self.get_test_file()) as consumer:
|
||||||
with self.assertRaisesMessage(
|
with self.assertRaisesMessage(
|
||||||
@@ -1017,7 +1016,7 @@ class TestConsumer(
|
|||||||
self._assert_first_last_send_progress()
|
self._assert_first_last_send_progress()
|
||||||
|
|
||||||
@override_settings(FILENAME_FORMAT="{title}")
|
@override_settings(FILENAME_FORMAT="{title}")
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def test_similar_filenames(self, m) -> None:
|
def test_similar_filenames(self, m) -> None:
|
||||||
shutil.copy(
|
shutil.copy(
|
||||||
Path(__file__).parent / "samples" / "simple.pdf",
|
Path(__file__).parent / "samples" / "simple.pdf",
|
||||||
@@ -1031,16 +1030,7 @@ class TestConsumer(
|
|||||||
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
||||||
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
||||||
)
|
)
|
||||||
m.return_value = [
|
m.return_value.get_parser_for_file.return_value = CopyParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": CopyParser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
||||||
consumer.run()
|
consumer.run()
|
||||||
@@ -1068,8 +1058,10 @@ class TestConsumer(
|
|||||||
|
|
||||||
sanity_check()
|
sanity_check()
|
||||||
|
|
||||||
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
@mock.patch("documents.consumer.run_subprocess")
|
@mock.patch("documents.consumer.run_subprocess")
|
||||||
def test_try_to_clean_invalid_pdf(self, m) -> None:
|
def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None:
|
||||||
|
mock_registry.return_value.get_parser_for_file.return_value = None
|
||||||
shutil.copy(
|
shutil.copy(
|
||||||
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
||||||
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
||||||
@@ -1091,10 +1083,10 @@ class TestConsumer(
|
|||||||
|
|
||||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def test_mail_parser_receives_mailrule(
|
def test_mail_parser_receives_mailrule(
|
||||||
self,
|
self,
|
||||||
mock_consumer_declaration_send: mock.Mock,
|
mock_get_parser_registry: mock.Mock,
|
||||||
mock_mail_parser_parse: mock.Mock,
|
mock_mail_parser_parse: mock.Mock,
|
||||||
mock_mailrule_get: mock.Mock,
|
mock_mailrule_get: mock.Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -1106,18 +1098,11 @@ class TestConsumer(
|
|||||||
THEN:
|
THEN:
|
||||||
- The mail parser should receive the mail rule
|
- The mail parser should receive the mail rule
|
||||||
"""
|
"""
|
||||||
from paperless_mail.signals import get_parser as mail_get_parser
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
|
|
||||||
mock_consumer_declaration_send.return_value = [
|
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
|
||||||
(
|
MailDocumentParser
|
||||||
None,
|
)
|
||||||
{
|
|
||||||
"parser": mail_get_parser,
|
|
||||||
"mime_types": {"message/rfc822": ".eml"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
mock_mailrule_get.return_value = mock.Mock(
|
mock_mailrule_get.return_value = mock.Mock(
|
||||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,132 +1,16 @@
|
|||||||
from tempfile import TemporaryDirectory
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.apps import apps
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from django.test import override_settings
|
from django.test import override_settings
|
||||||
|
|
||||||
from documents.parsers import get_default_file_extension
|
from documents.parsers import get_default_file_extension
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.parsers import get_supported_file_extensions
|
from documents.parsers import get_supported_file_extensions
|
||||||
from documents.parsers import is_file_ext_supported
|
from documents.parsers import is_file_ext_supported
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
from paperless.parsers.registry import reset_parser_registry
|
||||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class TestParserDiscovery(TestCase):
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_1_parser(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Parser declared for a given mimetype
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the mimetype
|
|
||||||
THEN:
|
|
||||||
- Declared parser class is returned
|
|
||||||
"""
|
|
||||||
|
|
||||||
class DummyParser:
|
|
||||||
pass
|
|
||||||
|
|
||||||
m.return_value = (
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 0,
|
|
||||||
"parser": DummyParser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_n_parsers(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Two parsers declared for a given mimetype
|
|
||||||
- Second parser has a higher weight
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the mimetype
|
|
||||||
THEN:
|
|
||||||
- Second parser class is returned
|
|
||||||
"""
|
|
||||||
|
|
||||||
class DummyParser1:
|
|
||||||
pass
|
|
||||||
|
|
||||||
class DummyParser2:
|
|
||||||
pass
|
|
||||||
|
|
||||||
m.return_value = (
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 0,
|
|
||||||
"parser": DummyParser1,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 1,
|
|
||||||
"parser": DummyParser2,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
get_parser_class_for_mime_type("application/pdf"),
|
|
||||||
DummyParser2,
|
|
||||||
)
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_0_parsers(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- No parsers are declared
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the mimetype
|
|
||||||
THEN:
|
|
||||||
- No parser class is returned
|
|
||||||
"""
|
|
||||||
m.return_value = []
|
|
||||||
with TemporaryDirectory():
|
|
||||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- No parser declared for a given mimetype
|
|
||||||
- Parser declared for a different mimetype
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the given mimetype
|
|
||||||
THEN:
|
|
||||||
- No parser class is returned
|
|
||||||
"""
|
|
||||||
|
|
||||||
class DummyParser:
|
|
||||||
pass
|
|
||||||
|
|
||||||
m.return_value = (
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 0,
|
|
||||||
"parser": DummyParser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserAvailability(TestCase):
|
class TestParserAvailability(TestCase):
|
||||||
def test_tesseract_parser(self) -> None:
|
def test_tesseract_parser(self) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -151,7 +35,7 @@ class TestParserAvailability(TestCase):
|
|||||||
self.assertIn(ext, supported_exts)
|
self.assertIn(ext, supported_exts)
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||||
RasterisedDocumentParser,
|
RasterisedDocumentParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -175,7 +59,7 @@ class TestParserAvailability(TestCase):
|
|||||||
self.assertIn(ext, supported_exts)
|
self.assertIn(ext, supported_exts)
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||||
TextDocumentParser,
|
TextDocumentParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -198,22 +82,23 @@ class TestParserAvailability(TestCase):
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Force the app ready to notice the settings override
|
self.addCleanup(reset_parser_registry)
|
||||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
|
||||||
app = apps.get_app_config("paperless_tika")
|
# Reset and rebuild the registry with Tika enabled.
|
||||||
app.ready()
|
with override_settings(TIKA_ENABLED=True):
|
||||||
|
reset_parser_registry()
|
||||||
supported_exts = get_supported_file_extensions()
|
supported_exts = get_supported_file_extensions()
|
||||||
|
|
||||||
for mime_type, ext in supported_mimes_and_exts:
|
for mime_type, ext in supported_mimes_and_exts:
|
||||||
self.assertIn(ext, supported_exts)
|
self.assertIn(ext, supported_exts)
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||||
TikaDocumentParser,
|
TikaDocumentParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_no_parser_for_mime(self) -> None:
|
def test_no_parser_for_mime(self) -> None:
|
||||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", ""))
|
||||||
|
|
||||||
def test_default_extension(self) -> None:
|
def test_default_extension(self) -> None:
|
||||||
# Test no parser declared still returns an extension
|
# Test no parser declared still returns an extension
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ import tempfile
|
|||||||
import zipfile
|
import zipfile
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from contextlib import nullcontext
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from time import mktime
|
from time import mktime
|
||||||
@@ -158,7 +157,6 @@ from documents.models import UiSettings
|
|||||||
from documents.models import Workflow
|
from documents.models import Workflow
|
||||||
from documents.models import WorkflowAction
|
from documents.models import WorkflowAction
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.permissions import AcknowledgeTasksPermissions
|
from documents.permissions import AcknowledgeTasksPermissions
|
||||||
from documents.permissions import PaperlessAdminPermissions
|
from documents.permissions import PaperlessAdminPermissions
|
||||||
from documents.permissions import PaperlessNotePermissions
|
from documents.permissions import PaperlessNotePermissions
|
||||||
@@ -226,7 +224,7 @@ from paperless.celery import app as celery_app
|
|||||||
from paperless.config import AIConfig
|
from paperless.config import AIConfig
|
||||||
from paperless.config import GeneralConfig
|
from paperless.config import GeneralConfig
|
||||||
from paperless.models import ApplicationConfiguration
|
from paperless.models import ApplicationConfiguration
|
||||||
from paperless.parsers import ParserProtocol
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless.serialisers import GroupSerializer
|
from paperless.serialisers import GroupSerializer
|
||||||
from paperless.serialisers import UserSerializer
|
from paperless.serialisers import UserSerializer
|
||||||
from paperless.views import StandardPagination
|
from paperless.views import StandardPagination
|
||||||
@@ -1083,17 +1081,17 @@ class DocumentViewSet(
|
|||||||
if not Path(file).is_file():
|
if not Path(file).is_file():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
parser_class = get_parser_registry().get_parser_for_file(
|
||||||
|
mime_type,
|
||||||
|
Path(file).name,
|
||||||
|
Path(file),
|
||||||
|
)
|
||||||
if parser_class:
|
if parser_class:
|
||||||
parser = parser_class(progress_callback=None, logging_group=None)
|
|
||||||
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with cm:
|
with parser_class() as parser:
|
||||||
return parser.extract_metadata(file, mime_type)
|
return parser.extract_metadata(file, mime_type)
|
||||||
except Exception: # pragma: no cover
|
except Exception: # pragma: no cover
|
||||||
logger.exception(f"Issue getting metadata for {file}")
|
logger.exception(f"Issue getting metadata for {file}")
|
||||||
# TODO: cover GPG errors, remove later.
|
|
||||||
return []
|
return []
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
logger.warning(f"No parser for {mime_type}")
|
logger.warning(f"No parser for {mime_type}")
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import os
|
|||||||
import pwd
|
import pwd
|
||||||
import shutil
|
import shutil
|
||||||
import stat
|
import stat
|
||||||
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@@ -299,3 +300,62 @@ def check_deprecated_db_settings(
|
|||||||
)
|
)
|
||||||
|
|
||||||
return warnings
|
return warnings
|
||||||
|
|
||||||
|
|
||||||
|
@register()
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
    """System check for the remote OCR parser settings.

    An error is reported only when ``settings.REMOTE_OCR_ENGINE`` is
    ``"azureai"`` and ``settings.REMOTE_OCR_ENDPOINT`` or
    ``settings.REMOTE_OCR_API_KEY`` is falsy.  ``app_configs`` and
    ``kwargs`` are accepted but not used.

    Returns
    -------
    list[Error]
        A single-element list describing the missing configuration, or an
        empty list when there is nothing to report.
    """
    # Any other engine value (including None) has nothing to validate here.
    if settings.REMOTE_OCR_ENGINE != "azureai":
        return []

    # Endpoint and API key must both be set for the configuration to pass.
    if settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY:
        return []

    return [
        Error(
            "Azure AI remote parser requires endpoint and API key to be configured.",
        ),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def get_tesseract_langs() -> list[str]:
    """Return the language names printed by ``tesseract --list-langs``.

    Returns
    -------
    list[str]
        One whitespace-stripped entry per non-blank output line after the
        header line.  Empty when no ``tesseract`` executable is found on
        ``PATH`` or when the command prints nothing to stdout.
    """
    tesseract = shutil.which("tesseract")
    if tesseract is None:
        # subprocess.run would raise TypeError on a None argv[0]; report
        # "no languages" instead so callers get a usable list.
        return []

    proc = subprocess.run(
        [tesseract, "--list-langs"],
        capture_output=True,
        # The exit status is deliberately not enforced: a failing call
        # simply yields whatever was written to stdout (usually nothing).
        check=False,
    )

    # Decode bytes to string, split on newlines, trim out the header
    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().splitlines()[1:]

    # Skip blank lines so an empty string is never reported as a language.
    return [lang for line in proc_lines if (lang := line.strip())]
|
||||||
|
|
||||||
|
|
||||||
|
@register()
def check_default_language_available(app_configs, **kwargs):
    """System check that the configured OCR languages are installed.

    ``settings.OCR_LANGUAGE`` is split on ``+`` and every part is compared
    against the output of ``get_tesseract_langs()``.  ``app_configs`` and
    ``kwargs`` are accepted but not used.

    Returns a list containing:

    - one ``Warning`` when ``settings.OCR_LANGUAGE`` is empty/unset;
    - otherwise one ``Error`` per requested language that is not installed;
    - nothing when the ``tesseract`` binary is not on ``PATH``.
    """
    if not settings.OCR_LANGUAGE:
        return [
            Warning(
                "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
                "This means that tesseract will fallback to english.",
            ),
        ]

    # binaries_check in paperless will check and report if this doesn't exist
    # So skip trying to do anything here and let that handle missing binaries
    if shutil.which("tesseract") is None:
        return []

    available = get_tesseract_langs()
    requested = [part.strip() for part in settings.OCR_LANGUAGE.split("+")]

    return [
        Error(
            f"The selected ocr language {lang} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
        )
        for lang in requested
        if lang not in available
    ]
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ name, version, author, url, supported_mime_types (callable), score (callable).
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import threading
|
||||||
from importlib.metadata import entry_points
|
from importlib.metadata import entry_points
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
@@ -49,6 +50,7 @@ logger = logging.getLogger("paperless.parsers.registry")
|
|||||||
|
|
||||||
_registry: ParserRegistry | None = None
|
_registry: ParserRegistry | None = None
|
||||||
_discovery_complete: bool = False
|
_discovery_complete: bool = False
|
||||||
|
_lock = threading.Lock()
|
||||||
|
|
||||||
# Attribute names that every registered external parser class must expose.
|
# Attribute names that every registered external parser class must expose.
|
||||||
_REQUIRED_ATTRS: tuple[str, ...] = (
|
_REQUIRED_ATTRS: tuple[str, ...] = (
|
||||||
@@ -74,7 +76,6 @@ def get_parser_registry() -> ParserRegistry:
|
|||||||
1. Creates a new ParserRegistry.
|
1. Creates a new ParserRegistry.
|
||||||
2. Calls register_defaults to install built-in parsers.
|
2. Calls register_defaults to install built-in parsers.
|
||||||
3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
|
3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
|
||||||
4. Calls log_summary to emit a startup summary.
|
|
||||||
|
|
||||||
Subsequent calls return the same instance immediately.
|
Subsequent calls return the same instance immediately.
|
||||||
|
|
||||||
@@ -85,14 +86,15 @@ def get_parser_registry() -> ParserRegistry:
|
|||||||
"""
|
"""
|
||||||
global _registry, _discovery_complete
|
global _registry, _discovery_complete
|
||||||
|
|
||||||
if _registry is None:
|
with _lock:
|
||||||
_registry = ParserRegistry()
|
if _registry is None:
|
||||||
_registry.register_defaults()
|
r = ParserRegistry()
|
||||||
|
r.register_defaults()
|
||||||
|
_registry = r
|
||||||
|
|
||||||
if not _discovery_complete:
|
if not _discovery_complete:
|
||||||
_registry.discover()
|
_registry.discover()
|
||||||
_registry.log_summary()
|
_discovery_complete = True
|
||||||
_discovery_complete = True
|
|
||||||
|
|
||||||
return _registry
|
return _registry
|
||||||
|
|
||||||
@@ -113,9 +115,11 @@ def init_builtin_parsers() -> None:
|
|||||||
"""
|
"""
|
||||||
global _registry
|
global _registry
|
||||||
|
|
||||||
if _registry is None:
|
with _lock:
|
||||||
_registry = ParserRegistry()
|
if _registry is None:
|
||||||
_registry.register_defaults()
|
r = ParserRegistry()
|
||||||
|
r.register_defaults()
|
||||||
|
_registry = r
|
||||||
|
|
||||||
|
|
||||||
def reset_parser_registry() -> None:
|
def reset_parser_registry() -> None:
|
||||||
@@ -304,6 +308,23 @@ class ParserRegistry:
|
|||||||
getattr(cls, "url", "unknown"),
|
getattr(cls, "url", "unknown"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Inspection helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def all_parsers(self) -> list[type[ParserProtocol]]:
    """Collect every registered parser class into a single new list.

    Intended for compatibility wrappers that have to walk all parsers in
    order to work out the complete set of supported MIME types and file
    extensions.

    Returns
    -------
    list[type[ParserProtocol]]
        The entries of ``self._external`` followed by the entries of
        ``self._builtins``.
    """
    combined = []
    combined.extend(self._external)  # externally registered parsers first
    combined.extend(self._builtins)  # then the built-in ones
    return combined
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Parser resolution
|
# Parser resolution
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -334,7 +355,7 @@ class ParserRegistry:
|
|||||||
mime_type:
|
mime_type:
|
||||||
The detected MIME type of the file.
|
The detected MIME type of the file.
|
||||||
filename:
|
filename:
|
||||||
The original filename, including extension.
|
The original filename, including extension. May be empty in some cases.
|
||||||
path:
|
path:
|
||||||
Optional filesystem path to the file. Forwarded to each
|
Optional filesystem path to the file. Forwarded to each
|
||||||
parser's score method.
|
parser's score method.
|
||||||
|
|||||||
@@ -121,10 +121,7 @@ INSTALLED_APPS = [
|
|||||||
"django_extensions",
|
"django_extensions",
|
||||||
"paperless",
|
"paperless",
|
||||||
"documents.apps.DocumentsConfig",
|
"documents.apps.DocumentsConfig",
|
||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
@@ -974,8 +971,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
|||||||
"http://localhost:3000",
|
"http://localhost:3000",
|
||||||
)
|
)
|
||||||
|
|
||||||
if TIKA_ENABLED:
|
# Tika parser is now integrated into the main parser registry
|
||||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
# No separate Django app needed
|
||||||
|
|
||||||
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||||
if AUDIT_LOG_ENABLED:
|
if AUDIT_LOG_ENABLED:
|
||||||
|
|||||||
@@ -90,35 +90,6 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
|||||||
yield parser
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Remote parser sample files
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def remote_samples_dir(samples_dir: Path) -> Path:
|
|
||||||
"""Absolute path to the remote parser sample files directory.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
``<samples_dir>/remote/``
|
|
||||||
"""
|
|
||||||
return samples_dir / "remote"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def sample_pdf_file(remote_samples_dir: Path) -> Path:
|
|
||||||
"""Path to a simple digital PDF sample file.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
Absolute path to ``remote/simple-digital.pdf``.
|
|
||||||
"""
|
|
||||||
return remote_samples_dir / "simple-digital.pdf"
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Remote parser instance
|
# Remote parser instance
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|||||||
@@ -277,20 +277,20 @@ class TestRemoteParserParse:
|
|||||||
def test_parse_returns_text_from_azure(
|
def test_parse_returns_text_from_azure(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
azure_client: Mock,
|
azure_client: Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
assert remote_parser.get_text() == _DEFAULT_TEXT
|
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||||
|
|
||||||
def test_parse_sets_archive_path(
|
def test_parse_sets_archive_path(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
azure_client: Mock,
|
azure_client: Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
archive = remote_parser.get_archive_path()
|
archive = remote_parser.get_archive_path()
|
||||||
assert archive is not None
|
assert archive is not None
|
||||||
@@ -300,11 +300,11 @@ class TestRemoteParserParse:
|
|||||||
def test_parse_closes_client_on_success(
|
def test_parse_closes_client_on_success(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
azure_client: Mock,
|
azure_client: Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.configure(ParserContext())
|
remote_parser.configure(ParserContext())
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
azure_client.close.assert_called_once()
|
azure_client.close.assert_called_once()
|
||||||
|
|
||||||
@@ -312,9 +312,9 @@ class TestRemoteParserParse:
|
|||||||
def test_parse_sets_empty_text_when_not_configured(
|
def test_parse_sets_empty_text_when_not_configured(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
assert remote_parser.get_text() == ""
|
assert remote_parser.get_text() == ""
|
||||||
assert remote_parser.get_archive_path() is None
|
assert remote_parser.get_archive_path() is None
|
||||||
@@ -328,10 +328,10 @@ class TestRemoteParserParse:
|
|||||||
def test_get_date_always_none(
|
def test_get_date_always_none(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
azure_client: Mock,
|
azure_client: Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
assert remote_parser.get_date() is None
|
assert remote_parser.get_date() is None
|
||||||
|
|
||||||
@@ -345,33 +345,33 @@ class TestRemoteParserParseError:
|
|||||||
def test_parse_returns_none_on_azure_error(
|
def test_parse_returns_none_on_azure_error(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
failing_azure_client: Mock,
|
failing_azure_client: Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
assert remote_parser.get_text() is None
|
assert remote_parser.get_text() is None
|
||||||
|
|
||||||
def test_parse_closes_client_on_error(
|
def test_parse_closes_client_on_error(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
failing_azure_client: Mock,
|
failing_azure_client: Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
failing_azure_client.close.assert_called_once()
|
failing_azure_client.close.assert_called_once()
|
||||||
|
|
||||||
def test_parse_logs_error_on_azure_failure(
|
def test_parse_logs_error_on_azure_failure(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
failing_azure_client: Mock,
|
failing_azure_client: Mock,
|
||||||
mocker: MockerFixture,
|
mocker: MockerFixture,
|
||||||
) -> None:
|
) -> None:
|
||||||
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||||
|
|
||||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
mock_log.error.assert_called_once()
|
mock_log.error.assert_called_once()
|
||||||
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||||
@@ -386,18 +386,18 @@ class TestRemoteParserPageCount:
|
|||||||
def test_page_count_for_pdf(
|
def test_page_count_for_pdf(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
|
count = remote_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||||
assert isinstance(count, int)
|
assert isinstance(count, int)
|
||||||
assert count >= 1
|
assert count >= 1
|
||||||
|
|
||||||
def test_page_count_returns_none_for_image_mime(
|
def test_page_count_returns_none_for_image_mime(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
count = remote_parser.get_page_count(sample_pdf_file, "image/png")
|
count = remote_parser.get_page_count(simple_digital_pdf_file, "image/png")
|
||||||
assert count is None
|
assert count is None
|
||||||
|
|
||||||
def test_page_count_returns_none_for_invalid_pdf(
|
def test_page_count_returns_none_for_invalid_pdf(
|
||||||
@@ -420,25 +420,31 @@ class TestRemoteParserMetadata:
|
|||||||
def test_extract_metadata_non_pdf_returns_empty(
|
def test_extract_metadata_non_pdf_returns_empty(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
|
result = remote_parser.extract_metadata(simple_digital_pdf_file, "image/png")
|
||||||
assert result == []
|
assert result == []
|
||||||
|
|
||||||
def test_extract_metadata_pdf_returns_list(
|
def test_extract_metadata_pdf_returns_list(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
result = remote_parser.extract_metadata(
|
||||||
|
simple_digital_pdf_file,
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
assert isinstance(result, list)
|
assert isinstance(result, list)
|
||||||
|
|
||||||
def test_extract_metadata_pdf_entries_have_required_keys(
|
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||||
self,
|
self,
|
||||||
remote_parser: RemoteDocumentParser,
|
remote_parser: RemoteDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
result = remote_parser.extract_metadata(
|
||||||
|
simple_digital_pdf_file,
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
for entry in result:
|
for entry in result:
|
||||||
assert "namespace" in entry
|
assert "namespace" in entry
|
||||||
assert "prefix" in entry
|
assert "prefix" in entry
|
||||||
|
|||||||
@@ -77,10 +77,10 @@ class TestTikaParserRegistryInterface:
|
|||||||
def test_get_page_count_returns_int_with_pdf_archive(
|
def test_get_page_count_returns_int_with_pdf_archive(
|
||||||
self,
|
self,
|
||||||
tika_parser: TikaDocumentParser,
|
tika_parser: TikaDocumentParser,
|
||||||
sample_pdf_file: Path,
|
simple_digital_pdf_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
tika_parser._archive_path = sample_pdf_file
|
tika_parser._archive_path = simple_digital_pdf_file
|
||||||
count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
|
count = tika_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||||
assert isinstance(count, int)
|
assert isinstance(count, int)
|
||||||
assert count > 0
|
assert count > 0
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -5,6 +5,7 @@ from pathlib import Path
|
|||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from django.core.checks import ERROR
|
||||||
from django.core.checks import Error
|
from django.core.checks import Error
|
||||||
from django.core.checks import Warning
|
from django.core.checks import Warning
|
||||||
from pytest_django.fixtures import SettingsWrapper
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
@@ -12,7 +13,9 @@ from pytest_mock import MockerFixture
|
|||||||
|
|
||||||
from paperless.checks import audit_log_check
|
from paperless.checks import audit_log_check
|
||||||
from paperless.checks import binaries_check
|
from paperless.checks import binaries_check
|
||||||
|
from paperless.checks import check_default_language_available
|
||||||
from paperless.checks import check_deprecated_db_settings
|
from paperless.checks import check_deprecated_db_settings
|
||||||
|
from paperless.checks import check_remote_parser_configured
|
||||||
from paperless.checks import check_v3_minimum_upgrade_version
|
from paperless.checks import check_v3_minimum_upgrade_version
|
||||||
from paperless.checks import debug_mode_check
|
from paperless.checks import debug_mode_check
|
||||||
from paperless.checks import paths_check
|
from paperless.checks import paths_check
|
||||||
@@ -626,3 +629,116 @@ class TestV3MinimumUpgradeVersionCheck:
|
|||||||
conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
|
conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
|
||||||
mocker.patch.dict("paperless.checks.connections", {"default": conn})
|
mocker.patch.dict("paperless.checks.connections", {"default": conn})
|
||||||
assert check_v3_minimum_upgrade_version(None) == []
|
assert check_v3_minimum_upgrade_version(None) == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserChecks:
|
||||||
|
def test_no_engine(self, settings: SettingsWrapper) -> None:
|
||||||
|
settings.REMOTE_OCR_ENGINE = None
|
||||||
|
msgs = check_remote_parser_configured(None)
|
||||||
|
|
||||||
|
assert len(msgs) == 0
|
||||||
|
|
||||||
|
def test_azure_no_endpoint(self, settings: SettingsWrapper) -> None:
|
||||||
|
|
||||||
|
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||||
|
settings.REMOTE_OCR_API_KEY = "somekey"
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = None
|
||||||
|
|
||||||
|
msgs = check_remote_parser_configured(None)
|
||||||
|
|
||||||
|
assert len(msgs) == 1
|
||||||
|
|
||||||
|
msg = msgs[0]
|
||||||
|
|
||||||
|
assert (
|
||||||
|
"Azure AI remote parser requires endpoint and API key to be configured."
|
||||||
|
in msg.msg
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTesseractChecks:
|
||||||
|
def test_default_language(self) -> None:
|
||||||
|
check_default_language_available(None)
|
||||||
|
|
||||||
|
def test_no_language(self, settings: SettingsWrapper) -> None:
|
||||||
|
|
||||||
|
settings.OCR_LANGUAGE = ""
|
||||||
|
|
||||||
|
msgs = check_default_language_available(None)
|
||||||
|
|
||||||
|
assert len(msgs) == 1
|
||||||
|
msg = msgs[0]
|
||||||
|
|
||||||
|
assert (
|
||||||
|
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" in msg.msg
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_invalid_language(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
|
||||||
|
settings.OCR_LANGUAGE = "ita"
|
||||||
|
|
||||||
|
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||||
|
tesser_lang_mock.return_value = ["deu", "eng"]
|
||||||
|
|
||||||
|
msgs = check_default_language_available(None)
|
||||||
|
|
||||||
|
assert len(msgs) == 1
|
||||||
|
msg = msgs[0]
|
||||||
|
|
||||||
|
assert msg.level == ERROR
|
||||||
|
assert "The selected ocr language ita is not installed" in msg.msg
|
||||||
|
|
||||||
|
def test_multi_part_language(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- An OCR language which is multi part (ie chi-sim)
|
||||||
|
- The language is correctly formatted
|
||||||
|
WHEN:
|
||||||
|
- Installed packages are checked
|
||||||
|
THEN:
|
||||||
|
- No errors are reported
|
||||||
|
"""
|
||||||
|
|
||||||
|
settings.OCR_LANGUAGE = "chi_sim"
|
||||||
|
|
||||||
|
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||||
|
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||||
|
|
||||||
|
msgs = check_default_language_available(None)
|
||||||
|
|
||||||
|
assert len(msgs) == 0
|
||||||
|
|
||||||
|
def test_multi_part_language_bad_format(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- An OCR language which is multi part (ie chi-sim)
|
||||||
|
- The language is correctly NOT formatted
|
||||||
|
WHEN:
|
||||||
|
- Installed packages are checked
|
||||||
|
THEN:
|
||||||
|
- No errors are reported
|
||||||
|
"""
|
||||||
|
settings.OCR_LANGUAGE = "chi-sim"
|
||||||
|
|
||||||
|
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||||
|
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||||
|
|
||||||
|
msgs = check_default_language_available(None)
|
||||||
|
|
||||||
|
assert len(msgs) == 1
|
||||||
|
msg = msgs[0]
|
||||||
|
|
||||||
|
assert msg.level == ERROR
|
||||||
|
assert "The selected ocr language chi-sim is not installed" in msg.msg
|
||||||
|
|||||||
@@ -1,18 +1,8 @@
|
|||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
from django.conf import settings
|
|
||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
|
|
||||||
from paperless_mail.signals import mail_consumer_declaration
|
|
||||||
|
|
||||||
|
|
||||||
class PaperlessMailConfig(AppConfig):
|
class PaperlessMailConfig(AppConfig):
|
||||||
name = "paperless_mail"
|
name = "paperless_mail"
|
||||||
|
|
||||||
verbose_name = _("Paperless mail")
|
verbose_name = _("Paperless mail")
|
||||||
|
|
||||||
def ready(self) -> None:
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
|
|
||||||
if settings.TIKA_ENABLED:
|
|
||||||
document_consumer_declaration.connect(mail_consumer_declaration)
|
|
||||||
AppConfig.ready(self)
|
|
||||||
|
|||||||
@@ -1,19 +0,0 @@
|
|||||||
def get_parser(*args, **kwargs):
|
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
|
||||||
|
|
||||||
# MailDocumentParser accepts no constructor args in the new-style protocol.
|
|
||||||
# Pop legacy args that arrive from the signal-based consumer path.
|
|
||||||
# Phase 4 will replace this signal path with the ParserRegistry.
|
|
||||||
kwargs.pop("logging_group", None)
|
|
||||||
kwargs.pop("progress_callback", None)
|
|
||||||
return MailDocumentParser()
|
|
||||||
|
|
||||||
|
|
||||||
def mail_consumer_declaration(sender, **kwargs):
|
|
||||||
return {
|
|
||||||
"parser": get_parser,
|
|
||||||
"weight": 20,
|
|
||||||
"mime_types": {
|
|
||||||
"message/rfc822": ".eml",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
# this is here so that django finds the checks.
|
|
||||||
from paperless_remote.checks import check_remote_parser_configured
|
|
||||||
|
|
||||||
__all__ = ["check_remote_parser_configured"]
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
from paperless_remote.signals import remote_consumer_declaration
|
|
||||||
|
|
||||||
|
|
||||||
class PaperlessRemoteParserConfig(AppConfig):
|
|
||||||
name = "paperless_remote"
|
|
||||||
|
|
||||||
def ready(self) -> None:
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
|
|
||||||
document_consumer_declaration.connect(remote_consumer_declaration)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
from django.conf import settings
|
|
||||||
from django.core.checks import Error
|
|
||||||
from django.core.checks import register
|
|
||||||
|
|
||||||
|
|
||||||
@register()
|
|
||||||
def check_remote_parser_configured(app_configs, **kwargs):
|
|
||||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
|
||||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
|
||||||
):
|
|
||||||
return [
|
|
||||||
Error(
|
|
||||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
return []
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
|
||||||
|
|
||||||
# The new RemoteDocumentParser does not accept the progress_callback
|
|
||||||
# kwarg injected by the old signal-based consumer. logging_group is
|
|
||||||
# forwarded as a positional arg.
|
|
||||||
# Phase 4 will replace this signal path with the new ParserRegistry.
|
|
||||||
kwargs.pop("progress_callback", None)
|
|
||||||
return RemoteDocumentParser(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def get_supported_mime_types() -> dict[str, str]:
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
|
||||||
from paperless.parsers.remote import RemoteEngineConfig
|
|
||||||
|
|
||||||
config = RemoteEngineConfig(
|
|
||||||
engine=settings.REMOTE_OCR_ENGINE,
|
|
||||||
api_key=settings.REMOTE_OCR_API_KEY,
|
|
||||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
|
||||||
)
|
|
||||||
if not config.engine_is_valid():
|
|
||||||
return {}
|
|
||||||
return RemoteDocumentParser.supported_mime_types()
|
|
||||||
|
|
||||||
|
|
||||||
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"parser": get_parser,
|
|
||||||
"weight": 5,
|
|
||||||
"mime_types": get_supported_mime_types(),
|
|
||||||
}
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
from unittest import TestCase
|
|
||||||
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from paperless_remote import check_remote_parser_configured
|
|
||||||
|
|
||||||
|
|
||||||
class TestChecks(TestCase):
|
|
||||||
@override_settings(REMOTE_OCR_ENGINE=None)
|
|
||||||
def test_no_engine(self) -> None:
|
|
||||||
msgs = check_remote_parser_configured(None)
|
|
||||||
self.assertEqual(len(msgs), 0)
|
|
||||||
|
|
||||||
@override_settings(REMOTE_OCR_ENGINE="azureai")
|
|
||||||
@override_settings(REMOTE_OCR_API_KEY="somekey")
|
|
||||||
@override_settings(REMOTE_OCR_ENDPOINT=None)
|
|
||||||
def test_azure_no_endpoint(self) -> None:
|
|
||||||
msgs = check_remote_parser_configured(None)
|
|
||||||
self.assertEqual(len(msgs), 1)
|
|
||||||
self.assertTrue(
|
|
||||||
msgs[0].msg.startswith(
|
|
||||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
# this is here so that django finds the checks.
|
|
||||||
from paperless_tesseract.checks import check_default_language_available
|
|
||||||
from paperless_tesseract.checks import get_tesseract_langs
|
|
||||||
|
|
||||||
__all__ = ["check_default_language_available", "get_tesseract_langs"]
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
from paperless_tesseract.signals import tesseract_consumer_declaration
|
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTesseractConfig(AppConfig):
|
|
||||||
name = "paperless_tesseract"
|
|
||||||
|
|
||||||
def ready(self) -> None:
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
|
|
||||||
document_consumer_declaration.connect(tesseract_consumer_declaration)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
from django.core.checks import Error
|
|
||||||
from django.core.checks import Warning
|
|
||||||
from django.core.checks import register
|
|
||||||
|
|
||||||
|
|
||||||
def get_tesseract_langs():
|
|
||||||
proc = subprocess.run(
|
|
||||||
[shutil.which("tesseract"), "--list-langs"],
|
|
||||||
capture_output=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Decode bytes to string, split on newlines, trim out the header
|
|
||||||
proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
|
|
||||||
|
|
||||||
return [x.strip() for x in proc_lines]
|
|
||||||
|
|
||||||
|
|
||||||
@register()
|
|
||||||
def check_default_language_available(app_configs, **kwargs):
|
|
||||||
errs = []
|
|
||||||
|
|
||||||
if not settings.OCR_LANGUAGE:
|
|
||||||
errs.append(
|
|
||||||
Warning(
|
|
||||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
|
|
||||||
"This means that tesseract will fallback to english.",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
return errs
|
|
||||||
|
|
||||||
# binaries_check in paperless will check and report if this doesn't exist
|
|
||||||
# So skip trying to do anything here and let that handle missing binaries
|
|
||||||
if shutil.which("tesseract") is not None:
|
|
||||||
installed_langs = get_tesseract_langs()
|
|
||||||
|
|
||||||
specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
|
|
||||||
|
|
||||||
for lang in specified_langs:
|
|
||||||
if lang not in installed_langs:
|
|
||||||
errs.append(
|
|
||||||
Error(
|
|
||||||
f"The selected ocr language {lang} is "
|
|
||||||
f"not installed. Paperless cannot OCR your documents "
|
|
||||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
return errs
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
|
||||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
|
||||||
|
|
||||||
# RasterisedDocumentParser accepts logging_group for constructor compatibility but
|
|
||||||
# does not store or use it (no legacy DocumentParser base class).
|
|
||||||
# progress_callback is also not used. Both may arrive as a positional arg
|
|
||||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
|
||||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
|
||||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
|
||||||
kwargs.pop("logging_group", None)
|
|
||||||
kwargs.pop("progress_callback", None)
|
|
||||||
return RasterisedDocumentParser(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"parser": get_parser,
|
|
||||||
"weight": 0,
|
|
||||||
"mime_types": {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/tiff": ".tif",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
"image/heic": ".heic",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.core.checks import ERROR
|
|
||||||
from django.test import TestCase
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from paperless_tesseract import check_default_language_available
|
|
||||||
|
|
||||||
|
|
||||||
class TestChecks(TestCase):
|
|
||||||
def test_default_language(self) -> None:
|
|
||||||
check_default_language_available(None)
|
|
||||||
|
|
||||||
@override_settings(OCR_LANGUAGE="")
|
|
||||||
def test_no_language(self) -> None:
|
|
||||||
msgs = check_default_language_available(None)
|
|
||||||
self.assertEqual(len(msgs), 1)
|
|
||||||
self.assertTrue(
|
|
||||||
msgs[0].msg.startswith(
|
|
||||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
@override_settings(OCR_LANGUAGE="ita")
|
|
||||||
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
|
|
||||||
def test_invalid_language(self, m) -> None:
|
|
||||||
m.return_value = ["deu", "eng"]
|
|
||||||
msgs = check_default_language_available(None)
|
|
||||||
self.assertEqual(len(msgs), 1)
|
|
||||||
self.assertEqual(msgs[0].level, ERROR)
|
|
||||||
|
|
||||||
@override_settings(OCR_LANGUAGE="chi_sim")
|
|
||||||
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
|
|
||||||
def test_multi_part_language(self, m) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- An OCR language which is multi part (ie chi-sim)
|
|
||||||
- The language is correctly formatted
|
|
||||||
WHEN:
|
|
||||||
- Installed packages are checked
|
|
||||||
THEN:
|
|
||||||
- No errors are reported
|
|
||||||
"""
|
|
||||||
m.return_value = ["chi_sim", "eng"]
|
|
||||||
|
|
||||||
msgs = check_default_language_available(None)
|
|
||||||
|
|
||||||
self.assertEqual(len(msgs), 0)
|
|
||||||
|
|
||||||
@override_settings(OCR_LANGUAGE="chi-sim")
|
|
||||||
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
|
|
||||||
def test_multi_part_language_bad_format(self, m) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- An OCR language which is multi part (ie chi-sim)
|
|
||||||
- The language is correctly NOT formatted
|
|
||||||
WHEN:
|
|
||||||
- Installed packages are checked
|
|
||||||
THEN:
|
|
||||||
- No errors are reported
|
|
||||||
"""
|
|
||||||
m.return_value = ["chi_sim", "eng"]
|
|
||||||
|
|
||||||
msgs = check_default_language_available(None)
|
|
||||||
|
|
||||||
self.assertEqual(len(msgs), 1)
|
|
||||||
self.assertEqual(msgs[0].level, ERROR)
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
from paperless_text.signals import text_consumer_declaration
|
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTextConfig(AppConfig):
|
|
||||||
name = "paperless_text"
|
|
||||||
|
|
||||||
def ready(self) -> None:
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
|
|
||||||
document_consumer_declaration.connect(text_consumer_declaration)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
|
||||||
from paperless.parsers.text import TextDocumentParser
|
|
||||||
|
|
||||||
# TextDocumentParser accepts logging_group for constructor compatibility but
|
|
||||||
# does not store or use it (no legacy DocumentParser base class).
|
|
||||||
# progress_callback is also not used. Both may arrive as a positional arg
|
|
||||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
|
||||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
|
||||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
|
||||||
kwargs.pop("logging_group", None)
|
|
||||||
kwargs.pop("progress_callback", None)
|
|
||||||
return TextDocumentParser(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"parser": get_parser,
|
|
||||||
"weight": 10,
|
|
||||||
"mime_types": {
|
|
||||||
"text/plain": ".txt",
|
|
||||||
"text/csv": ".csv",
|
|
||||||
"application/csv": ".csv",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
from paperless_tika.signals import tika_consumer_declaration
|
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTikaConfig(AppConfig):
|
|
||||||
name = "paperless_tika"
|
|
||||||
|
|
||||||
def ready(self) -> None:
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
|
|
||||||
if settings.TIKA_ENABLED:
|
|
||||||
document_consumer_declaration.connect(tika_consumer_declaration)
|
|
||||||
AppConfig.ready(self)
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
def get_parser(*args, **kwargs):
|
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
|
||||||
|
|
||||||
# TikaDocumentParser accepts logging_group for constructor compatibility but
|
|
||||||
# does not store or use it (no legacy DocumentParser base class).
|
|
||||||
# progress_callback is also not used. Both may arrive as a positional arg
|
|
||||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
|
||||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
|
||||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
|
||||||
kwargs.pop("logging_group", None)
|
|
||||||
kwargs.pop("progress_callback", None)
|
|
||||||
return TikaDocumentParser()
|
|
||||||
|
|
||||||
|
|
||||||
def tika_consumer_declaration(sender, **kwargs):
|
|
||||||
return {
|
|
||||||
"parser": get_parser,
|
|
||||||
"weight": 10,
|
|
||||||
"mime_types": {
|
|
||||||
"application/msword": ".doc",
|
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
||||||
"application/vnd.ms-excel": ".xls",
|
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
||||||
"application/vnd.ms-powerpoint": ".ppt",
|
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
|
|
||||||
"application/vnd.oasis.opendocument.presentation": ".odp",
|
|
||||||
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
|
|
||||||
"application/vnd.oasis.opendocument.text": ".odt",
|
|
||||||
"application/vnd.oasis.opendocument.graphics": ".odg",
|
|
||||||
"text/rtf": ".rtf",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user