mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-24 01:42:44 +00:00
Compare commits
3 Commits
fix-drop-s
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6191a8af9b | ||
|
|
79def8a200 | ||
|
|
701735f6e5 |
@@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None") [assignment]
|
||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path" [arg-type]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None") [assignment]
|
||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
|
||||
@@ -269,10 +269,6 @@ testpaths = [
|
||||
"src/documents/tests/",
|
||||
"src/paperless/tests/",
|
||||
"src/paperless_mail/tests/",
|
||||
"src/paperless_tesseract/tests/",
|
||||
"src/paperless_tika/tests",
|
||||
"src/paperless_text/tests/",
|
||||
"src/paperless_remote/tests/",
|
||||
"src/paperless_ai/tests",
|
||||
]
|
||||
|
||||
|
||||
@@ -3,25 +3,20 @@ from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from django.core.checks import register
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.templating.utils import convert_format_str_to_template_format
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
|
||||
@register()
|
||||
def parser_check(app_configs, **kwargs):
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
if len(parsers) == 0:
|
||||
if not get_parser_registry().all_parsers():
|
||||
return [
|
||||
Error(
|
||||
"No parsers found. This is a bug. The consumer won't be "
|
||||
"able to consume any documents without parsers.",
|
||||
),
|
||||
]
|
||||
else:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
@register()
|
||||
|
||||
@@ -32,9 +32,7 @@ from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.plugins.base import AlwaysRunPluginMixin
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
@@ -52,40 +50,12 @@ from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
|
||||
def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
"""
|
||||
Call cleanup on a parser, handling the new-style context-manager parsers.
|
||||
|
||||
New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
|
||||
instead of a cleanup() method. This shim will be removed once all existing parsers
|
||||
have switched to the new style and this consumer is updated to use it
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class WorkflowTriggerPlugin(
|
||||
NoCleanupPluginMixin,
|
||||
NoSetupPluginMixin,
|
||||
@@ -422,8 +392,12 @@ class ConsumerPlugin(
|
||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||
|
||||
# Based on the mime type, get the parser for that type
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
parser_class: type[ParserProtocol] | None = (
|
||||
get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
self.filename,
|
||||
self.working_copy,
|
||||
)
|
||||
)
|
||||
if not parser_class:
|
||||
tempdir.cleanup()
|
||||
@@ -446,313 +420,275 @@ class ConsumerPlugin(
|
||||
tempdir.cleanup()
|
||||
raise
|
||||
|
||||
def progress_callback(
|
||||
current_progress,
|
||||
max_progress,
|
||||
) -> None: # pragma: no cover
|
||||
# recalculate progress to be within 20 and 80
|
||||
p = int((current_progress / max_progress) * 50 + 20)
|
||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
||||
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser: DocumentParser = parser_class(
|
||||
self.logging_group,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
document_parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# New-style parsers use __enter__/__exit__ for resource management.
|
||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
if parser_is_new_style:
|
||||
document_parser.__enter__()
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
with parser_class() as document_parser:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
||||
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
)
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
90,
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
except ParseError as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
|
||||
# Prepare the document classifier.
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
self._send_progress(
|
||||
90,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(pk=self.metadata.actor_id).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
except ParseError as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(
|
||||
pk=self.metadata.actor_id,
|
||||
).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
use_format=False,
|
||||
)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
f"Deleting original file {self.input_doc.original_file}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
finally:
|
||||
_parser_cleanup(document_parser)
|
||||
tempdir.cleanup()
|
||||
finally:
|
||||
tempdir.cleanup()
|
||||
|
||||
self.run_post_consume_script(document)
|
||||
|
||||
|
||||
@@ -3,19 +3,18 @@ import shutil
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.models import Document
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
logger = logging.getLogger("paperless.management.thumbnails")
|
||||
|
||||
|
||||
def _process_document(doc_id: int) -> None:
|
||||
document: Document = Document.objects.get(id=doc_id)
|
||||
parser_class = get_parser_class_for_mime_type(document.mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
document.mime_type,
|
||||
document.original_filename or "",
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if parser_class is None:
|
||||
logger.warning(
|
||||
@@ -25,40 +24,9 @@ def _process_document(doc_id: int) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser = parser_class(logging_group=None)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
else:
|
||||
thumb = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
document.mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
with parser_class() as parser:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
shutil.move(thumb, document.thumbnail_path)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
if parser_is_new_style:
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
|
||||
@@ -3,84 +3,47 @@ from __future__ import annotations
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
|
||||
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
|
||||
|
||||
# TODO: isn't there a date parsing library for this?
|
||||
|
||||
DATE_REGEX = re.compile(
|
||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger("paperless.parsing")
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_mime_type_supported(mime_type: str) -> bool:
|
||||
"""
|
||||
Returns True if the mime type is supported, False otherwise
|
||||
"""
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
return get_parser_registry().get_parser_for_file(mime_type, "") is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def get_default_file_extension(mime_type: str) -> str:
|
||||
"""
|
||||
Returns the default file extension for a mimetype, or
|
||||
an empty string if it could not be determined
|
||||
"""
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
return supported_mime_types[mime_type]
|
||||
parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
|
||||
if parser_class is not None:
|
||||
supported = parser_class.supported_mime_types()
|
||||
if mime_type in supported:
|
||||
return supported[mime_type]
|
||||
|
||||
ext = mimetypes.guess_extension(mime_type)
|
||||
if ext:
|
||||
return ext
|
||||
else:
|
||||
return ""
|
||||
return ext if ext else ""
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_file_ext_supported(ext: str) -> bool:
|
||||
"""
|
||||
Returns True if the file extension is supported, False otherwise
|
||||
@@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool:
|
||||
|
||||
def get_supported_file_extensions() -> set[str]:
|
||||
extensions = set()
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
for mime_type in supported_mime_types:
|
||||
for parser_class in get_parser_registry().all_parsers():
|
||||
for mime_type, ext in parser_class.supported_mime_types().items():
|
||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||
# Python's stdlib might be behind, so also add what the parser
|
||||
# says is the default extension
|
||||
# This makes image/webp supported on Python < 3.11
|
||||
extensions.add(supported_mime_types[mime_type])
|
||||
extensions.add(ext)
|
||||
|
||||
return extensions
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
||||
"""
|
||||
Returns the best parser (by weight) for the given mimetype or
|
||||
None if no parser exists
|
||||
"""
|
||||
|
||||
options = []
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
return None
|
||||
|
||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
||||
|
||||
# Return the parser with the highest weight.
|
||||
return best_parser["parser"]
|
||||
|
||||
|
||||
def run_convert(
|
||||
input_file,
|
||||
output_file,
|
||||
|
||||
@@ -2,5 +2,4 @@ from django.dispatch import Signal
|
||||
|
||||
document_consumption_started = Signal()
|
||||
document_consumption_finished = Signal()
|
||||
document_consumer_declaration = Signal()
|
||||
document_updated = Signal()
|
||||
|
||||
@@ -52,8 +52,6 @@ from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowRun
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import ProgressManager
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
@@ -66,11 +64,7 @@ from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
@@ -310,8 +304,10 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
document.original_filename or "",
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if not parser_class:
|
||||
@@ -321,138 +317,92 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
||||
with parser_class() as parser:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.configure(ParserContext())
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type)
|
||||
else:
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
else:
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
|
||||
|
||||
@shared_task
|
||||
|
||||
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
|
||||
def test_parser_check(self) -> None:
|
||||
self.assertEqual(parser_check(None), [])
|
||||
|
||||
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
|
||||
m.return_value = []
|
||||
with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
|
||||
mock_registry = mock.MagicMock()
|
||||
mock_registry.all_parsers.return_value = []
|
||||
mock_registry_fn.return_value = mock_registry
|
||||
|
||||
self.assertEqual(
|
||||
parser_check(None),
|
||||
|
||||
@@ -27,7 +27,6 @@ from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.tasks import sanity_check
|
||||
@@ -38,62 +37,106 @@ from documents.tests.utils import GetConsumerMixin
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
|
||||
class _BaseTestParser(DocumentParser):
|
||||
def get_settings(self) -> None:
|
||||
class _BaseNewStyleParser:
|
||||
"""Minimal ParserProtocol implementation for use in consumer tests."""
|
||||
|
||||
name: str = "test-parser"
|
||||
version: str = "0.1"
|
||||
author: str = "test"
|
||||
url: str = "test"
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict:
|
||||
return {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def score(cls, mime_type: str, filename: str, path=None):
|
||||
return 0 if mime_type in cls.supported_mime_types() else None
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
return False
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._tmpdir: Path | None = None
|
||||
self._text: str | None = None
|
||||
self._archive: Path | None = None
|
||||
self._thumb: Path | None = None
|
||||
|
||||
def __enter__(self):
|
||||
self._tmpdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
_, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
|
||||
self._thumb = Path(thumb)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
if self._tmpdir and self._tmpdir.exists():
|
||||
shutil.rmtree(self._tmpdir, ignore_errors=True)
|
||||
|
||||
def configure(self, context) -> None:
|
||||
"""
|
||||
This parser does not implement additional settings yet
|
||||
Test parser doesn't do anything with context
|
||||
"""
|
||||
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
return None
|
||||
|
||||
def get_archive_path(self):
|
||||
return self._archive
|
||||
|
||||
class DummyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir, archive_path) -> None:
|
||||
super().__init__(logging_group, None)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
self.archive_path = archive_path
|
||||
def get_thumbnail(self, document_path, mime_type) -> Path:
|
||||
return self._thumb
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
return None
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The Text"
|
||||
def extract_metadata(self, document_path, mime_type) -> list:
|
||||
return []
|
||||
|
||||
|
||||
class CopyParser(_BaseTestParser):
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
class DummyParser(_BaseNewStyleParser):
|
||||
_ARCHIVE_SRC = (
|
||||
Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
|
||||
)
|
||||
|
||||
def __init__(self, logging_group, progress_callback=None) -> None:
|
||||
super().__init__(logging_group, progress_callback)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The text"
|
||||
self.archive_path = Path(self.tempdir / "archive.pdf")
|
||||
shutil.copy(document_path, self.archive_path)
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The Text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(self._ARCHIVE_SRC, self._archive)
|
||||
|
||||
|
||||
class FaultyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
class CopyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(document_path, self._archive)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
class FaultyGenericExceptionParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise Exception("Generic exception.")
|
||||
|
||||
|
||||
@@ -147,38 +190,12 @@ class TestConsumer(
|
||||
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
||||
self.assertEqual(payload["data"]["status"], last_status)
|
||||
|
||||
def make_dummy_parser(self, logging_group, progress_callback=None):
|
||||
return DummyParser(
|
||||
logging_group,
|
||||
self.dirs.scratch_dir,
|
||||
self.get_test_archive_file(),
|
||||
)
|
||||
|
||||
def make_faulty_parser(self, logging_group, progress_callback=None):
|
||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def make_faulty_generic_exception_parser(
|
||||
self,
|
||||
logging_group,
|
||||
progress_callback=None,
|
||||
):
|
||||
return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
|
||||
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
m = patcher.start()
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_dummy_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
patcher = mock.patch("documents.consumer.get_parser_registry")
|
||||
mock_registry = patcher.start()
|
||||
mock_registry.return_value.get_parser_for_file.return_value = DummyParser
|
||||
self.addCleanup(patcher.stop)
|
||||
|
||||
def get_test_file(self):
|
||||
@@ -547,9 +564,9 @@ class TestConsumer(
|
||||
) as consumer:
|
||||
consumer.run()
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testNoParsers(self, m) -> None:
|
||||
m.return_value = []
|
||||
m.return_value.get_parser_for_file.return_value = None
|
||||
|
||||
with self.assertRaisesMessage(
|
||||
ConsumerError,
|
||||
@@ -560,18 +577,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testFaultyParser(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = FaultyParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -582,18 +590,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testGenericParserException(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_generic_exception_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -1017,7 +1016,7 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_similar_filenames(self, m) -> None:
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "simple.pdf",
|
||||
@@ -1031,16 +1030,7 @@ class TestConsumer(
|
||||
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
||||
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
||||
)
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": CopyParser,
|
||||
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = CopyParser
|
||||
|
||||
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
||||
consumer.run()
|
||||
@@ -1068,8 +1058,10 @@ class TestConsumer(
|
||||
|
||||
sanity_check()
|
||||
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
def test_try_to_clean_invalid_pdf(self, m) -> None:
|
||||
def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None:
|
||||
mock_registry.return_value.get_parser_for_file.return_value = None
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
||||
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
||||
@@ -1091,10 +1083,10 @@ class TestConsumer(
|
||||
|
||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_mail_parser_receives_mailrule(
|
||||
self,
|
||||
mock_consumer_declaration_send: mock.Mock,
|
||||
mock_get_parser_registry: mock.Mock,
|
||||
mock_mail_parser_parse: mock.Mock,
|
||||
mock_mailrule_get: mock.Mock,
|
||||
) -> None:
|
||||
@@ -1106,18 +1098,11 @@ class TestConsumer(
|
||||
THEN:
|
||||
- The mail parser should receive the mail rule
|
||||
"""
|
||||
from paperless_mail.signals import get_parser as mail_get_parser
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
mock_consumer_declaration_send.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": mail_get_parser,
|
||||
"mime_types": {"message/rfc822": ".eml"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
|
||||
MailDocumentParser
|
||||
)
|
||||
mock_mailrule_get.return_value = mock.Mock(
|
||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||
)
|
||||
|
||||
@@ -1,132 +1,16 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.apps import apps
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.registry import reset_parser_registry
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_1_parser(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Parser declared for a given mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Declared parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_n_parsers(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Two parsers declared for a given mimetype
|
||||
- Second parser has a higher weight
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Second parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser1:
|
||||
pass
|
||||
|
||||
class DummyParser2:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser1,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 1,
|
||||
"parser": DummyParser2,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser2,
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_0_parsers(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parsers are declared
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
m.return_value = []
|
||||
with TemporaryDirectory():
|
||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parser declared for a given mimetype
|
||||
- Parser declared for a different mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the given mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
def test_tesseract_parser(self) -> None:
|
||||
"""
|
||||
@@ -151,7 +35,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
|
||||
@@ -175,7 +59,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
TextDocumentParser,
|
||||
)
|
||||
|
||||
@@ -198,22 +82,23 @@ class TestParserAvailability(TestCase):
|
||||
),
|
||||
]
|
||||
|
||||
# Force the app ready to notice the settings override
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
self.addCleanup(reset_parser_registry)
|
||||
|
||||
# Reset and rebuild the registry with Tika enabled.
|
||||
with override_settings(TIKA_ENABLED=True):
|
||||
reset_parser_registry()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
|
||||
def test_no_parser_for_mime(self) -> None:
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", ""))
|
||||
|
||||
def test_default_extension(self) -> None:
|
||||
# Test no parser declared still returns a an extension
|
||||
|
||||
@@ -7,7 +7,6 @@ import tempfile
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from collections import deque
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from time import mktime
|
||||
@@ -159,7 +158,6 @@ from documents.models import UiSettings
|
||||
from documents.models import Workflow
|
||||
from documents.models import WorkflowAction
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import AcknowledgeTasksPermissions
|
||||
from documents.permissions import PaperlessAdminPermissions
|
||||
from documents.permissions import PaperlessNotePermissions
|
||||
@@ -227,7 +225,7 @@ from paperless.celery import app as celery_app
|
||||
from paperless.config import AIConfig
|
||||
from paperless.config import GeneralConfig
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.serialisers import GroupSerializer
|
||||
from paperless.serialisers import UserSerializer
|
||||
from paperless.views import StandardPagination
|
||||
@@ -1084,17 +1082,17 @@ class DocumentViewSet(
|
||||
if not Path(file).is_file():
|
||||
return None
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
Path(file).name,
|
||||
Path(file),
|
||||
)
|
||||
if parser_class:
|
||||
parser = parser_class(progress_callback=None, logging_group=None)
|
||||
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
|
||||
|
||||
try:
|
||||
with cm:
|
||||
with parser_class() as parser:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception(f"Issue getting metadata for {file}")
|
||||
# TODO: cover GPG errors, remove later.
|
||||
return []
|
||||
else: # pragma: no cover
|
||||
logger.warning(f"No parser for {mime_type}")
|
||||
|
||||
@@ -2,7 +2,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2026-03-21 09:25+0000\n"
|
||||
"POT-Creation-Date: 2026-03-22 13:54+0000\n"
|
||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: English\n"
|
||||
@@ -1300,7 +1300,7 @@ msgid "workflow runs"
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:463 documents/serialisers.py:815
|
||||
#: documents/serialisers.py:2501 documents/views.py:1992
|
||||
#: documents/serialisers.py:2501 documents/views.py:1990
|
||||
#: paperless_mail/serialisers.py:143
|
||||
msgid "Insufficient permissions."
|
||||
msgstr ""
|
||||
@@ -1341,7 +1341,7 @@ msgstr ""
|
||||
msgid "Duplicate document identifiers are not allowed."
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:2587 documents/views.py:3598
|
||||
#: documents/serialisers.py:2587 documents/views.py:3596
|
||||
#, python-format
|
||||
msgid "Documents not found: %(ids)s"
|
||||
msgstr ""
|
||||
@@ -1605,24 +1605,24 @@ msgstr ""
|
||||
msgid "Unable to parse URI {value}"
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:1985
|
||||
#: documents/views.py:1983
|
||||
msgid "Invalid more_like_id"
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3610
|
||||
#: documents/views.py:3608
|
||||
#, python-format
|
||||
msgid "Insufficient permissions to share document %(id)s."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3653
|
||||
#: documents/views.py:3651
|
||||
msgid "Bundle is already being processed."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3710
|
||||
#: documents/views.py:3708
|
||||
msgid "The share link bundle is still being prepared. Please try again later."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3720
|
||||
#: documents/views.py:3718
|
||||
msgid "The share link bundle is unavailable."
|
||||
msgstr ""
|
||||
|
||||
@@ -1862,151 +1862,151 @@ msgstr ""
|
||||
msgid "paperless application settings"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:521
|
||||
#: paperless/settings/__init__.py:518
|
||||
msgid "English (US)"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:522
|
||||
#: paperless/settings/__init__.py:519
|
||||
msgid "Arabic"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:523
|
||||
#: paperless/settings/__init__.py:520
|
||||
msgid "Afrikaans"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:524
|
||||
#: paperless/settings/__init__.py:521
|
||||
msgid "Belarusian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:525
|
||||
#: paperless/settings/__init__.py:522
|
||||
msgid "Bulgarian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:526
|
||||
#: paperless/settings/__init__.py:523
|
||||
msgid "Catalan"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:527
|
||||
#: paperless/settings/__init__.py:524
|
||||
msgid "Czech"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:528
|
||||
#: paperless/settings/__init__.py:525
|
||||
msgid "Danish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:529
|
||||
#: paperless/settings/__init__.py:526
|
||||
msgid "German"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:530
|
||||
#: paperless/settings/__init__.py:527
|
||||
msgid "Greek"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:531
|
||||
#: paperless/settings/__init__.py:528
|
||||
msgid "English (GB)"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:532
|
||||
#: paperless/settings/__init__.py:529
|
||||
msgid "Spanish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:533
|
||||
#: paperless/settings/__init__.py:530
|
||||
msgid "Persian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:534
|
||||
#: paperless/settings/__init__.py:531
|
||||
msgid "Finnish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:535
|
||||
#: paperless/settings/__init__.py:532
|
||||
msgid "French"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:536
|
||||
#: paperless/settings/__init__.py:533
|
||||
msgid "Hungarian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:537
|
||||
#: paperless/settings/__init__.py:534
|
||||
msgid "Indonesian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:538
|
||||
#: paperless/settings/__init__.py:535
|
||||
msgid "Italian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:539
|
||||
#: paperless/settings/__init__.py:536
|
||||
msgid "Japanese"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:540
|
||||
#: paperless/settings/__init__.py:537
|
||||
msgid "Korean"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:541
|
||||
#: paperless/settings/__init__.py:538
|
||||
msgid "Luxembourgish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:542
|
||||
#: paperless/settings/__init__.py:539
|
||||
msgid "Norwegian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:543
|
||||
#: paperless/settings/__init__.py:540
|
||||
msgid "Dutch"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:544
|
||||
#: paperless/settings/__init__.py:541
|
||||
msgid "Polish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:545
|
||||
#: paperless/settings/__init__.py:542
|
||||
msgid "Portuguese (Brazil)"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:546
|
||||
#: paperless/settings/__init__.py:543
|
||||
msgid "Portuguese"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:547
|
||||
#: paperless/settings/__init__.py:544
|
||||
msgid "Romanian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:548
|
||||
#: paperless/settings/__init__.py:545
|
||||
msgid "Russian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:549
|
||||
#: paperless/settings/__init__.py:546
|
||||
msgid "Slovak"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:550
|
||||
#: paperless/settings/__init__.py:547
|
||||
msgid "Slovenian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:551
|
||||
#: paperless/settings/__init__.py:548
|
||||
msgid "Serbian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:552
|
||||
#: paperless/settings/__init__.py:549
|
||||
msgid "Swedish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:553
|
||||
#: paperless/settings/__init__.py:550
|
||||
msgid "Turkish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:554
|
||||
#: paperless/settings/__init__.py:551
|
||||
msgid "Ukrainian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:555
|
||||
#: paperless/settings/__init__.py:552
|
||||
msgid "Vietnamese"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:556
|
||||
#: paperless/settings/__init__.py:553
|
||||
msgid "Chinese Simplified"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:557
|
||||
#: paperless/settings/__init__.py:554
|
||||
msgid "Chinese Traditional"
|
||||
msgstr ""
|
||||
|
||||
@@ -2052,7 +2052,7 @@ msgid ""
|
||||
"process all matching rules that you have defined."
|
||||
msgstr ""
|
||||
|
||||
#: paperless_mail/apps.py:11
|
||||
#: paperless_mail/apps.py:8
|
||||
msgid "Paperless mail"
|
||||
msgstr ""
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import pwd
|
||||
import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
@@ -299,3 +300,62 @@ def check_deprecated_db_settings(
|
||||
)
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def get_tesseract_langs():
|
||||
proc = subprocess.run(
|
||||
[shutil.which("tesseract"), "--list-langs"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Decode bytes to string, split on newlines, trim out the header
|
||||
proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
|
||||
|
||||
return [x.strip() for x in proc_lines]
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
errs.append(
|
||||
Warning(
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
|
||||
"This means that tesseract will fallback to english.",
|
||||
),
|
||||
)
|
||||
return errs
|
||||
|
||||
# binaries_check in paperless will check and report if this doesn't exist
|
||||
# So skip trying to do anything here and let that handle missing binaries
|
||||
if shutil.which("tesseract") is not None:
|
||||
installed_langs = get_tesseract_langs()
|
||||
|
||||
specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
|
||||
|
||||
for lang in specified_langs:
|
||||
if lang not in installed_langs:
|
||||
errs.append(
|
||||
Error(
|
||||
f"The selected ocr language {lang} is "
|
||||
f"not installed. Paperless cannot OCR your documents "
|
||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
|
||||
),
|
||||
)
|
||||
|
||||
return errs
|
||||
|
||||
@@ -33,6 +33,7 @@ name, version, author, url, supported_mime_types (callable), score (callable).
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from importlib.metadata import entry_points
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -49,6 +50,7 @@ logger = logging.getLogger("paperless.parsers.registry")
|
||||
|
||||
_registry: ParserRegistry | None = None
|
||||
_discovery_complete: bool = False
|
||||
_lock = threading.Lock()
|
||||
|
||||
# Attribute names that every registered external parser class must expose.
|
||||
_REQUIRED_ATTRS: tuple[str, ...] = (
|
||||
@@ -74,7 +76,6 @@ def get_parser_registry() -> ParserRegistry:
|
||||
1. Creates a new ParserRegistry.
|
||||
2. Calls register_defaults to install built-in parsers.
|
||||
3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
|
||||
4. Calls log_summary to emit a startup summary.
|
||||
|
||||
Subsequent calls return the same instance immediately.
|
||||
|
||||
@@ -85,14 +86,15 @@ def get_parser_registry() -> ParserRegistry:
|
||||
"""
|
||||
global _registry, _discovery_complete
|
||||
|
||||
if _registry is None:
|
||||
_registry = ParserRegistry()
|
||||
_registry.register_defaults()
|
||||
with _lock:
|
||||
if _registry is None:
|
||||
r = ParserRegistry()
|
||||
r.register_defaults()
|
||||
_registry = r
|
||||
|
||||
if not _discovery_complete:
|
||||
_registry.discover()
|
||||
_registry.log_summary()
|
||||
_discovery_complete = True
|
||||
if not _discovery_complete:
|
||||
_registry.discover()
|
||||
_discovery_complete = True
|
||||
|
||||
return _registry
|
||||
|
||||
@@ -113,9 +115,11 @@ def init_builtin_parsers() -> None:
|
||||
"""
|
||||
global _registry
|
||||
|
||||
if _registry is None:
|
||||
_registry = ParserRegistry()
|
||||
_registry.register_defaults()
|
||||
with _lock:
|
||||
if _registry is None:
|
||||
r = ParserRegistry()
|
||||
r.register_defaults()
|
||||
_registry = r
|
||||
|
||||
|
||||
def reset_parser_registry() -> None:
|
||||
@@ -304,6 +308,23 @@ class ParserRegistry:
|
||||
getattr(cls, "url", "unknown"),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Inspection helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def all_parsers(self) -> list[type[ParserProtocol]]:
|
||||
"""Return all registered parser classes (external first, then builtins).
|
||||
|
||||
Used by compatibility wrappers that need to iterate every parser to
|
||||
compute the full set of supported MIME types and file extensions.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[type[ParserProtocol]]
|
||||
External parsers followed by built-in parsers.
|
||||
"""
|
||||
return [*self._external, *self._builtins]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Parser resolution
|
||||
# ------------------------------------------------------------------
|
||||
@@ -334,7 +355,7 @@ class ParserRegistry:
|
||||
mime_type:
|
||||
The detected MIME type of the file.
|
||||
filename:
|
||||
The original filename, including extension.
|
||||
The original filename, including extension. May be empty in some cases
|
||||
path:
|
||||
Optional filesystem path to the file. Forwarded to each
|
||||
parser's score method.
|
||||
|
||||
@@ -121,10 +121,7 @@ INSTALLED_APPS = [
|
||||
"django_extensions",
|
||||
"paperless",
|
||||
"documents.apps.DocumentsConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
@@ -974,8 +971,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||
"http://localhost:3000",
|
||||
)
|
||||
|
||||
if TIKA_ENABLED:
|
||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
||||
# Tika parser is now integrated into the main parser registry
|
||||
# No separate Django app needed
|
||||
|
||||
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||
if AUDIT_LOG_ENABLED:
|
||||
|
||||
@@ -90,35 +90,6 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def remote_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the remote parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/remote/``
|
||||
"""
|
||||
return samples_dir / "remote"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_pdf_file(remote_samples_dir: Path) -> Path:
|
||||
"""Path to a simple digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``remote/simple-digital.pdf``.
|
||||
"""
|
||||
return remote_samples_dir / "simple-digital.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@@ -277,20 +277,20 @@ class TestRemoteParserParse:
|
||||
def test_parse_returns_text_from_azure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||
|
||||
def test_parse_sets_archive_path(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
archive = remote_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
@@ -300,11 +300,11 @@ class TestRemoteParserParse:
|
||||
def test_parse_closes_client_on_success(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.configure(ParserContext())
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
azure_client.close.assert_called_once()
|
||||
|
||||
@@ -312,9 +312,9 @@ class TestRemoteParserParse:
|
||||
def test_parse_sets_empty_text_when_not_configured(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == ""
|
||||
assert remote_parser.get_archive_path() is None
|
||||
@@ -328,10 +328,10 @@ class TestRemoteParserParse:
|
||||
def test_get_date_always_none(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_date() is None
|
||||
|
||||
@@ -345,33 +345,33 @@ class TestRemoteParserParseError:
|
||||
def test_parse_returns_none_on_azure_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() is None
|
||||
|
||||
def test_parse_closes_client_on_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
failing_azure_client.close.assert_called_once()
|
||||
|
||||
def test_parse_logs_error_on_azure_failure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
mock_log.error.assert_called_once()
|
||||
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||
@@ -386,18 +386,18 @@ class TestRemoteParserPageCount:
|
||||
def test_page_count_for_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
count = remote_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count >= 1
|
||||
|
||||
def test_page_count_returns_none_for_image_mime(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "image/png")
|
||||
count = remote_parser.get_page_count(simple_digital_pdf_file, "image/png")
|
||||
assert count is None
|
||||
|
||||
def test_page_count_returns_none_for_invalid_pdf(
|
||||
@@ -420,25 +420,31 @@ class TestRemoteParserMetadata:
|
||||
def test_extract_metadata_non_pdf_returns_empty(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
|
||||
result = remote_parser.extract_metadata(simple_digital_pdf_file, "image/png")
|
||||
assert result == []
|
||||
|
||||
def test_extract_metadata_pdf_returns_list(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
result = remote_parser.extract_metadata(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
)
|
||||
assert isinstance(result, list)
|
||||
|
||||
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
result = remote_parser.extract_metadata(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
)
|
||||
for entry in result:
|
||||
assert "namespace" in entry
|
||||
assert "prefix" in entry
|
||||
|
||||
@@ -77,10 +77,10 @@ class TestTikaParserRegistryInterface:
|
||||
def test_get_page_count_returns_int_with_pdf_archive(
|
||||
self,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
tika_parser._archive_path = sample_pdf_file
|
||||
count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
tika_parser._archive_path = simple_digital_pdf_file
|
||||
count = tika_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
Binary file not shown.
@@ -5,6 +5,7 @@ from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from django.core.checks import ERROR
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
@@ -12,7 +13,9 @@ from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.checks import audit_log_check
|
||||
from paperless.checks import binaries_check
|
||||
from paperless.checks import check_default_language_available
|
||||
from paperless.checks import check_deprecated_db_settings
|
||||
from paperless.checks import check_remote_parser_configured
|
||||
from paperless.checks import check_v3_minimum_upgrade_version
|
||||
from paperless.checks import debug_mode_check
|
||||
from paperless.checks import paths_check
|
||||
@@ -626,3 +629,116 @@ class TestV3MinimumUpgradeVersionCheck:
|
||||
conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
|
||||
mocker.patch.dict("paperless.checks.connections", {"default": conn})
|
||||
assert check_v3_minimum_upgrade_version(None) == []
|
||||
|
||||
|
||||
class TestRemoteParserChecks:
|
||||
def test_no_engine(self, settings: SettingsWrapper) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = None
|
||||
msgs = check_remote_parser_configured(None)
|
||||
|
||||
assert len(msgs) == 0
|
||||
|
||||
def test_azure_no_endpoint(self, settings: SettingsWrapper) -> None:
|
||||
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "somekey"
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
|
||||
msgs = check_remote_parser_configured(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
|
||||
msg = msgs[0]
|
||||
|
||||
assert (
|
||||
"Azure AI remote parser requires endpoint and API key to be configured."
|
||||
in msg.msg
|
||||
)
|
||||
|
||||
|
||||
class TestTesseractChecks:
|
||||
def test_default_language(self) -> None:
|
||||
check_default_language_available(None)
|
||||
|
||||
def test_no_language(self, settings: SettingsWrapper) -> None:
|
||||
|
||||
settings.OCR_LANGUAGE = ""
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert (
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" in msg.msg
|
||||
)
|
||||
|
||||
def test_invalid_language(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
|
||||
settings.OCR_LANGUAGE = "ita"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["deu", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert msg.level == ERROR
|
||||
assert "The selected ocr language ita is not installed" in msg.msg
|
||||
|
||||
def test_multi_part_language(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
|
||||
settings.OCR_LANGUAGE = "chi_sim"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 0
|
||||
|
||||
def test_multi_part_language_bad_format(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly NOT formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
settings.OCR_LANGUAGE = "chi-sim"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert msg.level == ERROR
|
||||
assert "The selected ocr language chi-sim is not installed" in msg.msg
|
||||
|
||||
@@ -1,18 +1,8 @@
|
||||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from paperless_mail.signals import mail_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessMailConfig(AppConfig):
|
||||
name = "paperless_mail"
|
||||
|
||||
verbose_name = _("Paperless mail")
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
if settings.TIKA_ENABLED:
|
||||
document_consumer_declaration.connect(mail_consumer_declaration)
|
||||
AppConfig.ready(self)
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
# MailDocumentParser accepts no constructor args in the new-style protocol.
|
||||
# Pop legacy args that arrive from the signal-based consumer path.
|
||||
# Phase 4 will replace this signal path with the ParserRegistry.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return MailDocumentParser()
|
||||
|
||||
|
||||
def mail_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 20,
|
||||
"mime_types": {
|
||||
"message/rfc822": ".eml",
|
||||
},
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
# this is here so that django finds the checks.
|
||||
from paperless_remote.checks import check_remote_parser_configured
|
||||
|
||||
__all__ = ["check_remote_parser_configured"]
|
||||
@@ -1,14 +0,0 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_remote.signals import remote_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessRemoteParserConfig(AppConfig):
|
||||
name = "paperless_remote"
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(remote_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
@@ -1,17 +0,0 @@
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import register
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs):
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
||||
@@ -1,38 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
# The new RemoteDocumentParser does not accept the progress_callback
|
||||
# kwarg injected by the old signal-based consumer. logging_group is
|
||||
# forwarded as a positional arg.
|
||||
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||
kwargs.pop("progress_callback", None)
|
||||
return RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types() -> dict[str, str]:
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.remote import RemoteEngineConfig
|
||||
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
if not config.engine_is_valid():
|
||||
return {}
|
||||
return RemoteDocumentParser.supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 5,
|
||||
"mime_types": get_supported_mime_types(),
|
||||
}
|
||||
@@ -1,24 +0,0 @@
|
||||
from unittest import TestCase
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless_remote import check_remote_parser_configured
|
||||
|
||||
|
||||
class TestChecks(TestCase):
|
||||
@override_settings(REMOTE_OCR_ENGINE=None)
|
||||
def test_no_engine(self) -> None:
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 0)
|
||||
|
||||
@override_settings(REMOTE_OCR_ENGINE="azureai")
|
||||
@override_settings(REMOTE_OCR_API_KEY="somekey")
|
||||
@override_settings(REMOTE_OCR_ENDPOINT=None)
|
||||
def test_azure_no_endpoint(self) -> None:
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 1)
|
||||
self.assertTrue(
|
||||
msgs[0].msg.startswith(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
)
|
||||
@@ -1,5 +0,0 @@
|
||||
# this is here so that django finds the checks.
|
||||
from paperless_tesseract.checks import check_default_language_available
|
||||
from paperless_tesseract.checks import get_tesseract_langs
|
||||
|
||||
__all__ = ["check_default_language_available", "get_tesseract_langs"]
|
||||
@@ -1,14 +0,0 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_tesseract.signals import tesseract_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessTesseractConfig(AppConfig):
|
||||
name = "paperless_tesseract"
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(tesseract_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
@@ -1,52 +0,0 @@
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from django.core.checks import register
|
||||
|
||||
|
||||
def get_tesseract_langs():
|
||||
proc = subprocess.run(
|
||||
[shutil.which("tesseract"), "--list-langs"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Decode bytes to string, split on newlines, trim out the header
|
||||
proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
|
||||
|
||||
return [x.strip() for x in proc_lines]
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
errs.append(
|
||||
Warning(
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
|
||||
"This means that tesseract will fallback to english.",
|
||||
),
|
||||
)
|
||||
return errs
|
||||
|
||||
# binaries_check in paperless will check and report if this doesn't exist
|
||||
# So skip trying to do anything here and let that handle missing binaries
|
||||
if shutil.which("tesseract") is not None:
|
||||
installed_langs = get_tesseract_langs()
|
||||
|
||||
specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
|
||||
|
||||
for lang in specified_langs:
|
||||
if lang not in installed_langs:
|
||||
errs.append(
|
||||
Error(
|
||||
f"The selected ocr language {lang} is "
|
||||
f"not installed. Paperless cannot OCR your documents "
|
||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
|
||||
),
|
||||
)
|
||||
|
||||
return errs
|
||||
@@ -1,34 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
# RasterisedDocumentParser accepts logging_group for constructor compatibility but
|
||||
# does not store or use it (no legacy DocumentParser base class).
|
||||
# progress_callback is also not used. Both may arrive as a positional arg
|
||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return RasterisedDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 0,
|
||||
"mime_types": {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/tiff": ".tif",
|
||||
"image/gif": ".gif",
|
||||
"image/bmp": ".bmp",
|
||||
"image/webp": ".webp",
|
||||
"image/heic": ".heic",
|
||||
},
|
||||
}
|
||||
@@ -1,67 +0,0 @@
|
||||
from unittest import mock
|
||||
|
||||
from django.core.checks import ERROR
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless_tesseract import check_default_language_available
|
||||
|
||||
|
||||
class TestChecks(TestCase):
|
||||
def test_default_language(self) -> None:
|
||||
check_default_language_available(None)
|
||||
|
||||
@override_settings(OCR_LANGUAGE="")
|
||||
def test_no_language(self) -> None:
|
||||
msgs = check_default_language_available(None)
|
||||
self.assertEqual(len(msgs), 1)
|
||||
self.assertTrue(
|
||||
msgs[0].msg.startswith(
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE",
|
||||
),
|
||||
)
|
||||
|
||||
@override_settings(OCR_LANGUAGE="ita")
|
||||
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
|
||||
def test_invalid_language(self, m) -> None:
|
||||
m.return_value = ["deu", "eng"]
|
||||
msgs = check_default_language_available(None)
|
||||
self.assertEqual(len(msgs), 1)
|
||||
self.assertEqual(msgs[0].level, ERROR)
|
||||
|
||||
@override_settings(OCR_LANGUAGE="chi_sim")
|
||||
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
|
||||
def test_multi_part_language(self, m) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
m.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
self.assertEqual(len(msgs), 0)
|
||||
|
||||
@override_settings(OCR_LANGUAGE="chi-sim")
|
||||
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
|
||||
def test_multi_part_language_bad_format(self, m) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly NOT formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
m.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
self.assertEqual(len(msgs), 1)
|
||||
self.assertEqual(msgs[0].level, ERROR)
|
||||
@@ -1,14 +0,0 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_text.signals import text_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessTextConfig(AppConfig):
|
||||
name = "paperless_text"
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(text_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
@@ -1,29 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
# TextDocumentParser accepts logging_group for constructor compatibility but
|
||||
# does not store or use it (no legacy DocumentParser base class).
|
||||
# progress_callback is also not used. Both may arrive as a positional arg
|
||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return TextDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 10,
|
||||
"mime_types": {
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv",
|
||||
"application/csv": ".csv",
|
||||
},
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tika.signals import tika_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessTikaConfig(AppConfig):
|
||||
name = "paperless_tika"
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
if settings.TIKA_ENABLED:
|
||||
document_consumer_declaration.connect(tika_consumer_declaration)
|
||||
AppConfig.ready(self)
|
||||
@@ -1,33 +0,0 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
# TikaDocumentParser accepts logging_group for constructor compatibility but
|
||||
# does not store or use it (no legacy DocumentParser base class).
|
||||
# progress_callback is also not used. Both may arrive as a positional arg
|
||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return TikaDocumentParser()
|
||||
|
||||
|
||||
def tika_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 10,
|
||||
"mime_types": {
|
||||
"application/msword": ".doc",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
||||
"application/vnd.ms-excel": ".xls",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
||||
"application/vnd.ms-powerpoint": ".ppt",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
|
||||
"application/vnd.oasis.opendocument.presentation": ".odp",
|
||||
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
|
||||
"application/vnd.oasis.opendocument.text": ".odt",
|
||||
"application/vnd.oasis.opendocument.graphics": ".odg",
|
||||
"text/rtf": ".rtf",
|
||||
},
|
||||
}
|
||||
46
uv.lock
generated
46
uv.lock
generated
@@ -361,31 +361,31 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "cbor2"
|
||||
version = "5.8.0"
|
||||
version = "5.9.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d9/8e/8b4fdde28e42ffcd741a37f4ffa9fb59cd4fe01625b544dfcfd9ccb54f01/cbor2-5.8.0.tar.gz", hash = "sha256:b19c35fcae9688ac01ef75bad5db27300c2537eb4ee00ed07e05d8456a0d4931", size = 107825, upload-time = "2025-12-30T18:44:22.455Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/bd/cb/09939728be094d155b5d4ac262e39877875f5f7e36eea66beb359f647bd0/cbor2-5.9.0.tar.gz", hash = "sha256:85c7a46279ac8f226e1059275221e6b3d0e370d2bb6bd0500f9780781615bcea", size = 111231, upload-time = "2026-03-22T15:56:50.638Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/88/4b/623435ef9b98e86b6956a41863d39ff4fe4d67983948b5834f55499681dd/cbor2-5.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:18ac191640093e6c7fbcb174c006ffec4106c3d8ab788e70272c1c4d933cbe11", size = 69875, upload-time = "2025-12-30T18:43:35.888Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/58/17/f664201080b2a7d0f57c16c8e9e5922013b92f202e294863ec7e75b7ff7f/cbor2-5.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fddee9103a17d7bed5753f0c7fc6663faa506eb953e50d8287804eccf7b048e6", size = 268316, upload-time = "2025-12-30T18:43:37.161Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d0/e1/072745b4ff01afe9df2cd627f8fc51a1acedb5d3d1253765625d2929db91/cbor2-5.8.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8d2ea26fad620aba5e88d7541be8b10c5034a55db9a23809b7cb49f36803f05b", size = 258874, upload-time = "2025-12-30T18:43:38.878Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a7/10/61c262b886d22b62c56e8aac6d10fa06d0953c997879ab882a31a624952b/cbor2-5.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:de68b4b310b072b082d317adc4c5e6910173a6d9455412e6183d72c778d1f54c", size = 261971, upload-time = "2025-12-30T18:43:40.401Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7e/42/b7862f5e64364b10ad120ea53e87ec7e891fb268cb99c572348e647cf7e9/cbor2-5.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:418d2cf0e03e90160fa1474c05a40fe228bbb4a92d1628bdbbd13a48527cb34d", size = 254151, upload-time = "2025-12-30T18:43:41.938Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/4f/3a16e3e8fd7e5fd86751a4f1aad218a8d19a96e75ec3989c3e95a8fe1d8f/cbor2-5.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4b3f91fa699a5ce22470e973601c62dd9d55dc3ca20ee446516ac075fcab27c9", size = 70270, upload-time = "2025-12-30T18:43:46.005Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/81/0d0cf0796fe8081492a61c45278f03def21a929535a492dd97c8438f5dbe/cbor2-5.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:518c118a5e00001854adb51f3164e647aa99b6a9877d2a733a28cb5c0a4d6857", size = 286242, upload-time = "2025-12-30T18:43:47.026Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/a9/fdab6c10190cfb8d639e01f2b168f2406fc847a2a6bc00e7de78c3381d0a/cbor2-5.8.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cff2a1999e49cd51c23d1b6786a012127fd8f722c5946e82bd7ab3eb307443f3", size = 285412, upload-time = "2025-12-30T18:43:48.563Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/31/59/746a8e630996217a3afd523f583fcf7e3d16640d63f9a03f0f4e4f74b5b1/cbor2-5.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c4492160212374973cdc14e46f0565f2462721ef922b40f7ea11e7d613dfb2a", size = 278041, upload-time = "2025-12-30T18:43:49.92Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0f/a3/f3bbeb6dedd45c6e0cddd627ea790dea295eaf82c83f0e2159b733365ebd/cbor2-5.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:546c7c7c4c6bcdc54a59242e0e82cea8f332b17b4465ae628718fef1fce401ca", size = 278185, upload-time = "2025-12-30T18:43:51.192Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a6/0d/5a3f20bafaefeb2c1903d961416f051c0950f0d09e7297a3aa6941596b29/cbor2-5.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d8d104480845e2f28c6165b4c961bbe58d08cb5638f368375cfcae051c28015", size = 70332, upload-time = "2025-12-30T18:43:54.694Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/57/66/177a3f089e69db69c987453ab4934086408c3338551e4984734597be9f80/cbor2-5.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:43efee947e5ab67d406d6e0dc61b5dee9d2f5e89ae176f90677a3741a20ca2e7", size = 285985, upload-time = "2025-12-30T18:43:55.733Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b7/8e/9e17b8e4ed80a2ce97e2dfa5915c169dbb31599409ddb830f514b57f96cc/cbor2-5.8.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be7ae582f50be539e09c134966d0fd63723fc4789b8dff1f6c2e3f24ae3eaf32", size = 285173, upload-time = "2025-12-30T18:43:57.321Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/33/9f92e107d78f88ac22723ac15d0259d220ba98c1d855e51796317f4c4114/cbor2-5.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:50f5c709561a71ea7970b4cd2bf9eda4eccacc0aac212577080fdfe64183e7f5", size = 278395, upload-time = "2025-12-30T18:43:58.497Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/3f/46b80050a4a35ce5cf7903693864a9fdea7213567dc8faa6e25cb375c182/cbor2-5.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6790ecc73aa93e76d2d9076fc42bf91a9e69f2295e5fa702e776dbe986465bd", size = 278330, upload-time = "2025-12-30T18:43:59.656Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/0c/0654233d7543ac8a50f4785f172430ddc97538ba418eb305d6e529d1a120/cbor2-5.8.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ad72381477133046ce217617d839ea4e9454f8b77d9a6351b229e214102daeb7", size = 70710, upload-time = "2025-12-30T18:44:03.209Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/84/62/4671d24e557d7f5a74a01b422c538925140c0495e57decde7e566f91d029/cbor2-5.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6da25190fad3434ce99876b11d4ca6b8828df6ca232cf7344cd14ae1166fb718", size = 285005, upload-time = "2025-12-30T18:44:05.109Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/87/85/0c67d763a08e848c9a80d7e4723ba497cce676f41bc7ca1828ae90a0a872/cbor2-5.8.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c13919e3a24c5a6d286551fa288848a4cedc3e507c58a722ccd134e461217d99", size = 282435, upload-time = "2025-12-30T18:44:06.465Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b2/01/0650972b4dbfbebcfbe37cbba7fc3cd9019a8da6397ab3446e07175e342b/cbor2-5.8.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f8c40d32e5972047a777f9bf730870828f3cf1c43b3eb96fd0429c57a1d3b9e6", size = 277493, upload-time = "2025-12-30T18:44:07.609Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/6c/7704a4f32adc7f10f3b41ec067f500a4458f7606397af5e4cf2d368fd288/cbor2-5.8.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7627894bc0b3d5d0807f31e3107e11b996205470c4429dc2bb4ef8bfe7f64e1e", size = 276085, upload-time = "2025-12-30T18:44:09.021Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/4f/101071f880b4da05771128c0b89f41e334cff044dee05fb013c8f4be661c/cbor2-5.8.0-py3-none-any.whl", hash = "sha256:3727d80f539567b03a7aa11890e57798c67092c38df9e6c23abb059e0f65069c", size = 24374, upload-time = "2025-12-30T18:44:21.476Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/43/aa/317c7118b8dda4c9563125c1a12c70c5b41e36677964a49c72b1aac061ec/cbor2-5.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0485d3372fc832c5e16d4eb45fa1a20fc53e806e6c29a1d2b0d3e176cedd52b9", size = 70578, upload-time = "2026-03-22T15:56:03.835Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/31/43/fe29b1f897770011a5e7497f4523c2712282ee4a6cbf775ea6383fb7afb9/cbor2-5.9.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9d6e4e0f988b0e766509a8071975a8ee99f930e14a524620bf38083106158d2", size = 268738, upload-time = "2026-03-22T15:56:05.222Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0a/1a/e494568f3d8aafbcdfe361df44c3bcf5cdab5183e25ea08e3d3f9fcf4075/cbor2-5.9.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5326336f633cc89dfe543c78829c16c3a6449c2c03277d1ddba99086c3323363", size = 262571, upload-time = "2026-03-22T15:56:06.411Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/42/2e/92acd6f87382fd44a34d9d7e85cc45372e6ba664040b72d1d9df648b25d0/cbor2-5.9.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5e702b02d42a5ace45425b595ffe70fe35aebaf9a3cdfdc2c758b6189c744422", size = 262356, upload-time = "2026-03-22T15:56:08.236Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/68/52c039a28688baeeb78b0be7483855e6c66ea05884a937444deede0c87b8/cbor2-5.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2372d357d403e7912f104ff085950ffc82a5854d6d717f1ca1ce16a40a0ef5a7", size = 257604, upload-time = "2026-03-22T15:56:09.835Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/39/72d8a5a4b06565561ec28f4fcb41aff7bb77f51705c01f00b8254a2aca4f/cbor2-5.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1f223dffb1bcdd2764665f04c1152943d9daa4bc124a576cd8dee1cad4264313", size = 71223, upload-time = "2026-03-22T15:56:13.68Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/09/fd/7ddf3d3153b54c69c3be77172b8d9aa3a9d74f62a7fbde614d53eaeed9a4/cbor2-5.9.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ae6c706ac1d85a0b3cb3395308fd0c4d55e3202b4760773675957e93cdff45fc", size = 287865, upload-time = "2026-03-22T15:56:14.813Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/db/9d/7ede2cc42f9bb4260492e7d29d2aab781eacbbcfb09d983de1e695077199/cbor2-5.9.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4cd43d8fc374b31643b2830910f28177a606a7bc84975a62675dd3f2e320fc7b", size = 288246, upload-time = "2026-03-22T15:56:16.113Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/9d/588ebc7c5bc5843f609b05fe07be8575c7dec987735b0bbc908ac9c1264a/cbor2-5.9.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4aa07b392cc3d76fb31c08a46a226b58c320d1c172ff3073e864409ced7bc50f", size = 280214, upload-time = "2026-03-22T15:56:17.519Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f7/a1/6fc8f4b15c6a27e7fbb7966c30c2b4b18c274a3221fa2f5e6235502d34bc/cbor2-5.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:971d425b3a23b75953d8853d5f9911bdeefa09d759ee3b5e6b07b5ff3cbd9073", size = 282162, upload-time = "2026-03-22T15:56:18.975Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/c5/4901e21a8afe9448fd947b11e8f383903207cd6dd0800e5f5a386838de5b/cbor2-5.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fbb06f34aa645b4deca66643bba3d400d20c15312d1fe88d429be60c1ab50f27", size = 71284, upload-time = "2026-03-22T15:56:22.836Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1b/10/df643a381aebc3f05486de4813662bc58accb640fc3275cb276a75e89694/cbor2-5.9.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac684fe195c39821fca70d18afbf748f728aefbfbf88456018d299e559b8cae0", size = 287682, upload-time = "2026-03-22T15:56:24.024Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c6/0c/8aa6b766059ae4a0ca1ec3ff96fe3823a69a7be880dba2e249f7fbe2700b/cbor2-5.9.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a54fbb32cb828c214f7f333a707e4aec61182e7efdc06ea5d9596d3ecee624a", size = 288009, upload-time = "2026-03-22T15:56:25.305Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/74/07/6236bc25c183a9cf7e8062e5dddf9eae9b0b14ebf14a58a69fe5a1e872c6/cbor2-5.9.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4753a6d1bc71054d9179557bc65740860f185095ccb401d46637fff028a5b3ec", size = 280437, upload-time = "2026-03-22T15:56:26.479Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/0a/84328d23c3c68874ac6497edb9b1900579a1028efa54734df3f1762bbc15/cbor2-5.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:380e534482b843e43442b87d8777a7bf9bed20cb7526f89b780c3400f617304b", size = 282247, upload-time = "2026-03-22T15:56:28.644Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/08/7d/9ccc36d10ef96e6038e48046ebe1ce35a1e7814da0e1e204d09e6ef09b8d/cbor2-5.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23606d31ba1368bd1b6602e3020ee88fe9523ca80e8630faf6b2fc904fd84560", size = 71500, upload-time = "2026-03-22T15:56:31.876Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/70/e1/a6cca2cc72e13f00030c6a649f57ae703eb2c620806ab70c40db8eab33fa/cbor2-5.9.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0322296b9d52f55880e300ba8ba09ecf644303b99b51138bbb1c0fb644fa7c3e", size = 286953, upload-time = "2026-03-22T15:56:33.292Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/08/3c/24cd5ef488a957d90e016f200a3aad820e4c2f85edd61c9fe4523007a1ee/cbor2-5.9.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:422817286c1d0ce947fb2f7eca9212b39bddd7231e8b452e2d2cc52f15332dba", size = 285454, upload-time = "2026-03-22T15:56:34.703Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a4/35/dca96818494c0ba47cdd73e8d809b27fa91f8fa0ce32a068a09237687454/cbor2-5.9.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9a4907e0c3035bb8836116854ed8e56d8aef23909d601fa59706320897ec2551", size = 279441, upload-time = "2026-03-22T15:56:35.888Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a4/44/d3362378b16e53cf7e535a3f5aed8476e2109068154e24e31981ef5bde9e/cbor2-5.9.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fb7afe77f8d269e42d7c4b515c6fd14f1ccc0625379fb6829b269f493d16eddd", size = 279673, upload-time = "2026-03-22T15:56:37.08Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/42/ff/b83492b096fbef26e9cb62c1a4bf2d3cef579ea7b33138c6c37c4ae66f67/cbor2-5.9.0-py3-none-any.whl", hash = "sha256:27695cbd70c90b8de5c4a284642c2836449b14e2c2e07e3ffe0744cb7669a01b", size = 24627, upload-time = "2026-03-22T15:56:48.847Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
Reference in New Issue
Block a user