Compare commits
31 Commits
feature-sh
...
feature-te
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e24a2d8214 | ||
|
|
8e3dfcb4ee | ||
|
|
1b45e4d029 | ||
|
|
6b279e9368 | ||
|
|
97bc53ccdc | ||
|
|
80fa4f6f12 | ||
|
|
c5b006e666 | ||
|
|
ad1654d89b | ||
|
|
466a402715 | ||
|
|
b2e3048083 | ||
|
|
fe1e35b9ac | ||
|
|
d01513a869 | ||
|
|
9e3c93f72d | ||
|
|
f7c12d550a | ||
|
|
68fc898042 | ||
|
|
16e73f611d | ||
|
|
b66cfb1867 | ||
|
|
49e1ebb620 | ||
|
|
8148f2ced2 | ||
|
|
a36b6ecbef | ||
|
|
2cbe6ae892 | ||
|
|
b0bb31654f | ||
|
|
07237bde6a | ||
|
|
b80702acb8 | ||
|
|
7428bbb8dc | ||
|
|
9a709abb7d | ||
|
|
3236bbd0c5 | ||
|
|
d107c8c531 | ||
|
|
8c671514ab | ||
|
|
f2c16a7d98 | ||
|
|
7c76e65950 |
@@ -248,15 +248,13 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
|
||||
lint.per-file-ignores."src/documents/models.py" = [
|
||||
"SIM115",
|
||||
]
|
||||
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
|
||||
"RUF001",
|
||||
]
|
||||
|
||||
lint.isort.force-single-line = true
|
||||
|
||||
[tool.codespell]
|
||||
write-changes = true
|
||||
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
|
||||
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
|
||||
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
|
||||
|
||||
[tool.pytest]
|
||||
minversion = "9.0"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import tempfile
|
||||
from enum import StrEnum
|
||||
@@ -47,13 +48,15 @@ from documents.signals import document_consumption_started
|
||||
from documents.signals import document_updated
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
@@ -68,7 +71,16 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
@@ -214,7 +226,9 @@ class ConsumerPlugin(
|
||||
version_doc = Document(
|
||||
root_document=root_doc_frozen,
|
||||
version_index=next_version_index + 1,
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
checksum=hashlib.md5(
|
||||
file_for_checksum.read_bytes(),
|
||||
).hexdigest(),
|
||||
content=text or "",
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
@@ -447,10 +461,21 @@ class ConsumerPlugin(
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
document_parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# New-style parsers use __enter__/__exit__ for resource management.
|
||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
if parser_is_new_style:
|
||||
document_parser.__enter__()
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
@@ -471,17 +496,12 @@ class ConsumerPlugin(
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
if (
|
||||
isinstance(document_parser, MailDocumentParser)
|
||||
and self.input_doc.mailrule_id
|
||||
):
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
self.input_doc.mailrule_id,
|
||||
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
@@ -494,8 +514,8 @@ class ConsumerPlugin(
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
@@ -685,9 +705,10 @@ class ConsumerPlugin(
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
document.archive_checksum = compute_checksum(
|
||||
Path(archive_path),
|
||||
)
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
@@ -828,7 +849,7 @@ class ConsumerPlugin(
|
||||
title=title[:127],
|
||||
content=text,
|
||||
mime_type=mime_type,
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
page_count=page_count,
|
||||
@@ -945,9 +966,10 @@ class ConsumerPreflightPlugin(
|
||||
|
||||
def pre_check_duplicate(self) -> None:
|
||||
"""
|
||||
Using the SHA256 of the file, check this exact file doesn't already exist
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
checksum = compute_checksum(Path(self.input_doc.original_file))
|
||||
with Path(self.input_doc.original_file).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
|
||||
@@ -56,7 +56,6 @@ from documents.models import WorkflowTrigger
|
||||
from documents.settings import EXPORTER_ARCHIVE_NAME
|
||||
from documents.settings import EXPORTER_FILE_NAME
|
||||
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from paperless import version
|
||||
from paperless.models import ApplicationConfiguration
|
||||
@@ -694,7 +693,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
source_stat = source.stat()
|
||||
target_stat = target.stat()
|
||||
if self.compare_checksums and source_checksum:
|
||||
target_checksum = compute_checksum(target)
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
perform_copy = target_checksum != source_checksum
|
||||
elif (
|
||||
source_stat.st_mtime != target_stat.st_mtime
|
||||
|
||||
@@ -4,6 +4,11 @@ import shutil
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.models import Document
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
logger = logging.getLogger("paperless.management.thumbnails")
|
||||
|
||||
@@ -22,16 +27,38 @@ def _process_document(doc_id: int) -> None:
|
||||
|
||||
parser = parser_class(logging_group=None)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
thumb = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
document.mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
else:
|
||||
thumb = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
document.mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
shutil.move(thumb, document.thumbnail_path)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
parser.cleanup()
|
||||
if parser_is_new_style:
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
logger = logging.getLogger("paperless.migrations")
|
||||
|
||||
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
|
||||
_BATCH_SIZE = 500 # documents per bulk_update call
|
||||
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
|
||||
|
||||
|
||||
def _sha256(path: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as fh:
|
||||
while chunk := fh.read(_CHUNK_SIZE):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def recompute_checksums(apps, schema_editor):
|
||||
"""Recompute all document checksums from MD5 to SHA256."""
|
||||
Document = apps.get_model("documents", "Document")
|
||||
|
||||
total = Document.objects.count()
|
||||
if total == 0:
|
||||
return
|
||||
|
||||
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
|
||||
|
||||
batch: list = []
|
||||
processed = 0
|
||||
|
||||
for doc in Document.objects.only(
|
||||
"pk",
|
||||
"filename",
|
||||
"checksum",
|
||||
"archive_filename",
|
||||
"archive_checksum",
|
||||
).iterator(chunk_size=_BATCH_SIZE):
|
||||
updated_fields: list[str] = []
|
||||
|
||||
# Reconstruct source path the same way Document.source_path does
|
||||
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
|
||||
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
|
||||
|
||||
if source_path.exists():
|
||||
doc.checksum = _sha256(source_path)
|
||||
updated_fields.append("checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: original file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
source_path,
|
||||
)
|
||||
|
||||
# Mirror Document.has_archive_version: archive_filename is not None
|
||||
if doc.archive_filename is not None:
|
||||
archive_path = (
|
||||
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
|
||||
).resolve()
|
||||
if archive_path.exists():
|
||||
doc.archive_checksum = _sha256(archive_path)
|
||||
updated_fields.append("archive_checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: archive file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
archive_path,
|
||||
)
|
||||
|
||||
if updated_fields:
|
||||
batch.append(doc)
|
||||
|
||||
processed += 1
|
||||
|
||||
if len(batch) >= _BATCH_SIZE:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
batch.clear()
|
||||
|
||||
if processed % _PROGRESS_INTERVAL == 0:
|
||||
logger.info(
|
||||
"SHA-256 checksum progress: %d/%d (%d%%)",
|
||||
processed,
|
||||
total,
|
||||
processed * 100 // total,
|
||||
)
|
||||
|
||||
if batch:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
|
||||
logger.info(
|
||||
"SHA-256 checksum recomputation complete: %d document(s) processed.",
|
||||
total,
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0015_document_version_index_and_more"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="checksum",
|
||||
field=models.CharField(
|
||||
editable=False,
|
||||
help_text="The checksum of the original document.",
|
||||
max_length=64,
|
||||
verbose_name="checksum",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="archive_checksum",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
editable=False,
|
||||
help_text="The checksum of the archived document.",
|
||||
max_length=64,
|
||||
null=True,
|
||||
verbose_name="archive checksum",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
|
||||
]
|
||||
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
|
||||
checksum = models.CharField(
|
||||
_("checksum"),
|
||||
max_length=64,
|
||||
max_length=32,
|
||||
editable=False,
|
||||
help_text=_("The checksum of the original document."),
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
_("archive checksum"),
|
||||
max_length=64,
|
||||
max_length=32,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
|
||||
@@ -11,6 +11,7 @@ is an identity function that adds no overhead.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
@@ -29,7 +30,6 @@ from django.utils import timezone
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
from documents.utils import compute_checksum
|
||||
from paperless.config import GeneralConfig
|
||||
|
||||
logger = logging.getLogger("paperless.sanity_checker")
|
||||
@@ -218,7 +218,7 @@ def _check_original(
|
||||
|
||||
present_files.discard(source_path)
|
||||
try:
|
||||
checksum = compute_checksum(source_path)
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
else:
|
||||
@@ -255,7 +255,7 @@ def _check_archive(
|
||||
|
||||
present_files.discard(archive_path)
|
||||
try:
|
||||
checksum = compute_checksum(archive_path)
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
import uuid
|
||||
@@ -62,9 +63,14 @@ from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import compute_checksum
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
@@ -304,7 +310,9 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
)
|
||||
|
||||
if not parser_class:
|
||||
logger.error(
|
||||
@@ -315,19 +323,48 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type, document.get_public_filename())
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.configure(ParserContext())
|
||||
parser.parse(document.source_path, mime_type)
|
||||
else:
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
else:
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
checksum = compute_checksum(Path(parser.get_archive_path()))
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
@@ -402,8 +439,20 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
parser.cleanup()
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
@shared_task
|
||||
|
||||
@@ -82,8 +82,8 @@ def sample_doc(
|
||||
|
||||
return DocumentFactory(
|
||||
title="test",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
content="test content",
|
||||
pk=1,
|
||||
filename="0000001.pdf",
|
||||
|
||||
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
|
||||
model = Document
|
||||
|
||||
title = factory.Faker("sentence", nb_words=4)
|
||||
checksum = factory.Faker("sha256")
|
||||
checksum = factory.Faker("md5")
|
||||
content = factory.Faker("paragraph")
|
||||
correspondent = None
|
||||
document_type = None
|
||||
|
||||
@@ -101,13 +101,17 @@ class TestSystemStatus(APITestCase):
|
||||
- The response contains the correct install type
|
||||
"""
|
||||
self.client.force_login(self.user)
|
||||
os.environ["PNGX_CONTAINERIZED"] = "1"
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["install_type"], "docker")
|
||||
os.environ["KUBERNETES_SERVICE_HOST"] = "http://localhost"
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.data["install_type"], "kubernetes")
|
||||
with mock.patch.dict(os.environ, {"PNGX_CONTAINERIZED": "1"}, clear=False):
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["install_type"], "docker")
|
||||
with mock.patch.dict(
|
||||
os.environ,
|
||||
{"PNGX_CONTAINERIZED": "1", "KUBERNETES_SERVICE_HOST": "http://localhost"},
|
||||
clear=False,
|
||||
):
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.data["install_type"], "kubernetes")
|
||||
|
||||
@mock.patch("redis.Redis.execute_command")
|
||||
def test_system_status_redis_ping(self, mock_ping) -> None:
|
||||
|
||||
@@ -36,7 +36,6 @@ from documents.tests.utils import DummyProgressManager
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import GetConsumerMixin
|
||||
from paperless_mail.models import MailRule
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
|
||||
class _BaseTestParser(DocumentParser):
|
||||
@@ -245,14 +244,8 @@ class TestConsumer(
|
||||
|
||||
self.assertIsFile(document.archive_path)
|
||||
|
||||
self.assertEqual(
|
||||
document.checksum,
|
||||
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
)
|
||||
self.assertEqual(
|
||||
document.archive_checksum,
|
||||
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
)
|
||||
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
|
||||
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
|
||||
|
||||
self.assertIsNotFile(filename)
|
||||
|
||||
@@ -1097,7 +1090,7 @@ class TestConsumer(
|
||||
self.assertEqual(command[1], "--replace-input")
|
||||
|
||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_mail_parser_receives_mailrule(
|
||||
self,
|
||||
@@ -1113,11 +1106,13 @@ class TestConsumer(
|
||||
THEN:
|
||||
- The mail parser should receive the mail rule
|
||||
"""
|
||||
from paperless_mail.signals import get_parser as mail_get_parser
|
||||
|
||||
mock_consumer_declaration_send.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": MailDocumentParser,
|
||||
"parser": mail_get_parser,
|
||||
"mime_types": {"message/rfc822": ".eml"},
|
||||
"weight": 0,
|
||||
},
|
||||
@@ -1129,9 +1124,10 @@ class TestConsumer(
|
||||
with self.get_consumer(
|
||||
filepath=(
|
||||
Path(__file__).parent.parent.parent
|
||||
/ Path("paperless_mail")
|
||||
/ Path("paperless")
|
||||
/ Path("tests")
|
||||
/ Path("samples")
|
||||
/ Path("mail")
|
||||
).resolve()
|
||||
/ "html.eml",
|
||||
source=DocumentSource.MailFetch,
|
||||
@@ -1142,12 +1138,10 @@ class TestConsumer(
|
||||
ConsumerError,
|
||||
):
|
||||
consumer.run()
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
file_name="sample.pdf",
|
||||
mailrule=mock_mailrule_get.return_value,
|
||||
)
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
|
||||
@@ -63,8 +63,8 @@ class TestExportImport(
|
||||
|
||||
self.d1 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -72,21 +72,21 @@ class TestExportImport(
|
||||
)
|
||||
self.d2 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
|
||||
checksum="9c9691e51741c1f4f41a20896af31770",
|
||||
title="wow2",
|
||||
filename="0000002.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d3 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
|
||||
checksum="d38d7ed02e988e072caf924e0f3fcb76",
|
||||
title="wow2",
|
||||
filename="0000003.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d4 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
|
||||
checksum="82186aaa94f0b98697d704b90fd1c072",
|
||||
title="wow_dec",
|
||||
filename="0000004.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -239,7 +239,7 @@ class TestExportImport(
|
||||
)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["checksum"])
|
||||
|
||||
# Generated field "content_length" should not be exported,
|
||||
@@ -253,7 +253,7 @@ class TestExportImport(
|
||||
self.assertIsFile(fname)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["archive_checksum"])
|
||||
|
||||
elif element["model"] == "documents.note":
|
||||
|
||||
@@ -277,8 +277,8 @@ class TestCommandImport(
|
||||
|
||||
Document.objects.create(
|
||||
content="Content",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
|
||||
@@ -9,9 +9,9 @@ from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
from os import utime
|
||||
@@ -129,28 +128,3 @@ def get_boolean(boolstr: str) -> bool:
|
||||
Return a boolean value from a string representation.
|
||||
"""
|
||||
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
|
||||
"""
|
||||
Compute the SHA-256 checksum of a file.
|
||||
|
||||
Reads the file in chunks to avoid loading the entire file into memory.
|
||||
|
||||
Args:
|
||||
path (Path): Path to the file to hash.
|
||||
chunk_size (int, optional): Number of bytes to read per chunk.
|
||||
Defaults to 65536.
|
||||
|
||||
Returns:
|
||||
str: Hexadecimal SHA-256 digest of the file contents.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the file does not exist.
|
||||
OSError: If the file cannot be read.
|
||||
"""
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as f:
|
||||
while chunk := f.read(chunk_size):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
@@ -35,6 +35,7 @@ Usage example (third-party parser)::
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Protocol
|
||||
from typing import Self
|
||||
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
|
||||
|
||||
__all__ = [
|
||||
"MetadataEntry",
|
||||
"ParserContext",
|
||||
"ParserProtocol",
|
||||
]
|
||||
|
||||
@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
|
||||
"""String representation of the field value."""
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ParserContext:
|
||||
"""Immutable context passed to a parser before parse().
|
||||
|
||||
The consumer assembles this from the ingestion event and Django
|
||||
settings, then calls ``parser.configure(context)`` before
|
||||
``parser.parse()``. Parsers read only the fields relevant to them;
|
||||
unneeded fields are ignored.
|
||||
|
||||
``frozen=True`` prevents accidental mutation after the consumer
|
||||
hands the context off. ``slots=True`` keeps instances lightweight.
|
||||
|
||||
Fields
|
||||
------
|
||||
mailrule_id : int | None
|
||||
Primary key of the ``MailRule`` that triggered this ingestion,
|
||||
or ``None`` when the document did not arrive via a mail rule.
|
||||
Used by ``MailDocumentParser`` to select the PDF layout.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Future fields (not yet implemented):
|
||||
|
||||
* ``output_type`` — PDF/A variant for archive generation
|
||||
(replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
|
||||
* ``ocr_mode`` — skip-text, redo, force, etc.
|
||||
(replaces ``settings.OCR_MODE`` reads inside parsers).
|
||||
* ``ocr_language`` — Tesseract language string.
|
||||
(replaces ``settings.OCR_LANGUAGE`` reads inside parsers).
|
||||
|
||||
When those fields are added the consumer will read from Django
|
||||
settings once and populate them here, decoupling parsers from
|
||||
``settings.*`` entirely.
|
||||
"""
|
||||
|
||||
mailrule_id: int | None = None
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class ParserProtocol(Protocol):
|
||||
"""Structural contract for all Paperless-ngx document parsers.
|
||||
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
"""Apply source context before parse().
|
||||
|
||||
Called by the consumer after instantiation and before parse().
|
||||
The default implementation is a no-op; parsers override only the
|
||||
fields they need.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
context:
|
||||
Immutable context assembled by the consumer for this
|
||||
specific ingestion event.
|
||||
"""
|
||||
...
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
|
||||
834
src/paperless/parsers/mail.py
Normal file
@@ -0,0 +1,834 @@
|
||||
"""
|
||||
Built-in mail document parser.
|
||||
|
||||
Handles message/rfc822 (EML) MIME type by:
|
||||
- Parsing the email using imap_tools
|
||||
- Generating a PDF via Gotenberg (for display and archive)
|
||||
- Extracting text via Tika for HTML content
|
||||
- Extracting metadata from email headers
|
||||
|
||||
The parser always produces a PDF because EML files cannot be rendered
|
||||
natively in a browser (requires_pdf_rendition=True).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from html import escape
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
from bleach import clean
|
||||
from bleach import linkify
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import is_naive
|
||||
from django.utils.timezone import make_aware
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.constants import A4
|
||||
from gotenberg_client.options import Measurement
|
||||
from gotenberg_client.options import MeasurementUnitType
|
||||
from gotenberg_client.options import PageMarginsType
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from humanize import naturalsize
|
||||
from imap_tools import MailAttachment
|
||||
from imap_tools import MailMessage
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless.version import __full_version_str__
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.mail")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
|
||||
class MailDocumentParser:
|
||||
"""Parse .eml email files for Paperless-ngx.
|
||||
|
||||
Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
|
||||
and sends the HTML part to a Tika server for text extraction. Because
|
||||
EML files cannot be rendered natively in a browser, the parser always
|
||||
produces a PDF rendition (requires_pdf_rendition=True).
|
||||
|
||||
Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
|
||||
apply mail-rule-specific PDF layout options:
|
||||
|
||||
parser.configure(ParserContext(mailrule_id=rule.pk))
|
||||
parser.parse(path, mime_type)
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
name: str = "Paperless-ngx Mail Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser handles.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
10 if the MIME type is supported, otherwise None.
|
||||
"""
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — the mail parser produces a display PDF
|
||||
(requires_pdf_rendition=True), not an optional OCR archive.
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always True — EML files cannot be rendered natively in a browser,
|
||||
so a PDF conversion is always required for display.
|
||||
"""
|
||||
return True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._text: str | None = None
|
||||
self._date: datetime.datetime | None = None
|
||||
self._archive_path: Path | None = None
|
||||
self._mailrule_id: int | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
self._mailrule_id = context.mailrule_id
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Parse the given .eml into formatted text and a PDF archive.
|
||||
|
||||
Call ``configure(ParserContext(mailrule_id=...))`` before this method
|
||||
to apply mail-rule-specific PDF layout options. The ``produce_archive``
|
||||
flag is accepted for protocol compatibility but is always honoured —
|
||||
the mail parser always produces a PDF since EML files cannot be
|
||||
displayed natively.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the .eml file.
|
||||
mime_type:
|
||||
Detected MIME type of the document (should be "message/rfc822").
|
||||
produce_archive:
|
||||
Accepted for protocol compatibility. The PDF rendition is always
|
||||
produced since EML files cannot be displayed natively in a browser.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the file cannot be parsed or PDF generation fails.
|
||||
"""
|
||||
|
||||
def strip_text(text: str) -> str:
|
||||
"""Reduces the spacing of the given text string."""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = re.sub(r"(\n *)+", "\n", text)
|
||||
return text.strip()
|
||||
|
||||
def build_formatted_text(mail_message: MailMessage) -> str:
|
||||
"""Constructs a formatted string based on the given email."""
|
||||
fmt_text = f"Subject: {mail_message.subject}\n\n"
|
||||
fmt_text += f"From: {mail_message.from_values.full if mail_message.from_values else ''}\n\n"
|
||||
to_list = [address.full for address in mail_message.to_values]
|
||||
fmt_text += f"To: {', '.join(to_list)}\n\n"
|
||||
if mail_message.cc_values:
|
||||
fmt_text += (
|
||||
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
|
||||
)
|
||||
if mail_message.bcc_values:
|
||||
fmt_text += (
|
||||
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
|
||||
)
|
||||
if mail_message.attachments:
|
||||
att = []
|
||||
for a in mail.attachments:
|
||||
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
|
||||
att.append(
|
||||
f"{a.filename} ({attachment_size})",
|
||||
)
|
||||
fmt_text += f"Attachments: {', '.join(att)}\n\n"
|
||||
|
||||
if mail.html:
|
||||
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
|
||||
|
||||
fmt_text += f"\n\n{strip_text(mail.text)}"
|
||||
|
||||
return fmt_text
|
||||
|
||||
logger.debug("Parsing file %s into an email", document_path.name)
|
||||
mail = self.parse_file_to_message(document_path)
|
||||
|
||||
logger.debug("Building formatted text from email")
|
||||
self._text = build_formatted_text(mail)
|
||||
|
||||
if is_naive(mail.date):
|
||||
self._date = make_aware(mail.date)
|
||||
else:
|
||||
self._date = mail.date
|
||||
|
||||
logger.debug("Creating a PDF from the email")
|
||||
if self._mailrule_id:
|
||||
rule = MailRule.objects.get(pk=self._mailrule_id)
|
||||
self._archive_path = self.generate_pdf(
|
||||
mail,
|
||||
MailRule.PdfLayout(rule.pdf_layout),
|
||||
)
|
||||
else:
|
||||
self._archive_path = self.generate_pdf(mail)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
"""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Date from the email headers, or None if not detected.
|
||||
"""
|
||||
return self._date
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to the generated archive PDF, or None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path | None
|
||||
Path to the PDF produced by Gotenberg, or None if parse has not
|
||||
been called yet.
|
||||
"""
|
||||
return self._archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
file_name: str | None = None,
|
||||
) -> Path:
|
||||
"""Generate a thumbnail from the PDF rendition of the email.
|
||||
|
||||
Converts the document to PDF first if not already done.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
file_name:
|
||||
Kept for backward compatibility; not used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temporary directory.
|
||||
"""
|
||||
if not self._archive_path:
|
||||
self._archive_path = self.generate_pdf(
|
||||
self.parse_file_to_message(document_path),
|
||||
)
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self._archive_path,
|
||||
self._tempdir,
|
||||
)
|
||||
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in the document.
|
||||
|
||||
Counts pages in the archive PDF produced by a preceding parse()
|
||||
call. Returns ``None`` if parse() has not been called yet or if
|
||||
no archive was produced.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count of the archive PDF, or ``None``.
|
||||
"""
|
||||
if self._archive_path is not None:
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(self._archive_path, log=logger)
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract metadata from the email headers.
|
||||
|
||||
Returns email headers as metadata entries with prefix "header",
|
||||
plus summary entries for attachments and date.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
Sorted list of metadata entries, or ``[]`` on parse failure.
|
||||
"""
|
||||
result: list[MetadataEntry] = []
|
||||
|
||||
try:
|
||||
mail = self.parse_file_to_message(document_path)
|
||||
except ParseError as e:
|
||||
logger.warning(
|
||||
"Error while fetching document metadata for %s: %s",
|
||||
document_path,
|
||||
e,
|
||||
)
|
||||
return result
|
||||
|
||||
for key, header_values in mail.headers.items():
|
||||
value = ", ".join(header_values)
|
||||
try:
|
||||
value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
logger.debug("Skipping header %s: %s", key, e)
|
||||
continue
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "header",
|
||||
"key": key,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": "attachments",
|
||||
"value": ", ".join(
|
||||
f"{attachment.filename}"
|
||||
f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
|
||||
for attachment in mail.attachments
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": "date",
|
||||
"value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
|
||||
},
|
||||
)
|
||||
|
||||
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Email-specific methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
|
||||
"""Convert the OCR output type setting to a Gotenberg PdfAFormat."""
|
||||
if settings.OCR_OUTPUT_TYPE in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
|
||||
logger.warning(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
|
||||
return PdfAFormat.A3b
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def parse_file_to_message(filepath: Path) -> MailMessage:
|
||||
"""Parse the given .eml file into a MailMessage object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath:
|
||||
Path to the .eml file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MailMessage
|
||||
Parsed mail message.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the file cannot be parsed or is missing required fields.
|
||||
"""
|
||||
try:
|
||||
with filepath.open("rb") as eml:
|
||||
parsed = MailMessage.from_bytes(eml.read())
|
||||
if parsed.from_values is None:
|
||||
raise ParseError(
|
||||
f"Could not parse {filepath}: Missing 'from'",
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {filepath}: {err}",
|
||||
) from err
|
||||
|
||||
return parsed
|
||||
|
||||
def tika_parse(self, html: str) -> str:
|
||||
"""Send HTML content to the Tika server for text extraction.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
html:
|
||||
HTML string to parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
Extracted plain text.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the Tika server cannot be reached or returns an error.
|
||||
"""
|
||||
logger.info("Sending content to Tika server")
|
||||
|
||||
try:
|
||||
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||
|
||||
if parsed.content is not None:
|
||||
return parsed.content.strip()
|
||||
return ""
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse content with tika server at "
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
def generate_pdf(
|
||||
self,
|
||||
mail_message: MailMessage,
|
||||
pdf_layout: MailRule.PdfLayout | None = None,
|
||||
) -> Path:
|
||||
"""Generate a PDF from the email message.
|
||||
|
||||
Creates separate PDFs for the email body and HTML content, then
|
||||
merges them according to the requested layout.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mail_message:
|
||||
Parsed email message.
|
||||
pdf_layout:
|
||||
Layout option for the PDF. Falls back to the
|
||||
EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
"""
|
||||
archive_path = Path(self._tempdir) / "merged.pdf"
|
||||
|
||||
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
|
||||
|
||||
if pdf_layout is None:
|
||||
pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)
|
||||
|
||||
# If no HTML content, create the PDF from the message.
|
||||
# Otherwise, create 2 PDFs and merge them with Gotenberg.
|
||||
if not mail_message.html:
|
||||
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
||||
else:
|
||||
pdf_of_html_content = self.generate_pdf_from_html(
|
||||
mail_message.html,
|
||||
mail_message.attachments,
|
||||
)
|
||||
|
||||
logger.debug("Merging email text and HTML content into single PDF")
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.merge.merge() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
match pdf_layout:
|
||||
case MailRule.PdfLayout.HTML_TEXT:
|
||||
route.merge([pdf_of_html_content, mail_pdf_file])
|
||||
case MailRule.PdfLayout.HTML_ONLY:
|
||||
route.merge([pdf_of_html_content])
|
||||
case MailRule.PdfLayout.TEXT_ONLY:
|
||||
route.merge([mail_pdf_file])
|
||||
case MailRule.PdfLayout.TEXT_HTML | _:
|
||||
route.merge([mail_pdf_file, pdf_of_html_content])
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
archive_path.write_bytes(response.content)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while merging email HTML into PDF: {err}",
|
||||
) from err
|
||||
|
||||
return archive_path
|
||||
|
||||
def mail_to_html(self, mail: MailMessage) -> Path:
|
||||
"""Convert the given email into an HTML file using a template.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mail:
|
||||
Parsed mail message.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the rendered HTML file inside the temporary directory.
|
||||
"""
|
||||
|
||||
def clean_html(text: str) -> str:
|
||||
"""Attempt to clean, escape, and linkify the given HTML string."""
|
||||
if isinstance(text, list):
|
||||
text = "\n".join([str(e) for e in text])
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
text = escape(text)
|
||||
text = clean(text)
|
||||
text = linkify(text, parse_email=True)
|
||||
text = text.replace("\n", "<br>")
|
||||
return text
|
||||
|
||||
data = {}
|
||||
|
||||
data["subject"] = clean_html(mail.subject)
|
||||
if data["subject"]:
|
||||
data["subject_label"] = "Subject"
|
||||
data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
|
||||
if data["from"]:
|
||||
data["from_label"] = "From"
|
||||
data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
|
||||
if data["to"]:
|
||||
data["to_label"] = "To"
|
||||
data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
|
||||
if data["cc"]:
|
||||
data["cc_label"] = "CC"
|
||||
data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
|
||||
if data["bcc"]:
|
||||
data["bcc_label"] = "BCC"
|
||||
|
||||
att = []
|
||||
for a in mail.attachments:
|
||||
att.append(
|
||||
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
|
||||
)
|
||||
data["attachments"] = clean_html(", ".join(att))
|
||||
if data["attachments"]:
|
||||
data["attachments_label"] = "Attachments"
|
||||
|
||||
data["date"] = clean_html(
|
||||
timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
|
||||
)
|
||||
data["content"] = clean_html(mail.text.strip())
|
||||
|
||||
from django.template.loader import render_to_string
|
||||
|
||||
html_file = Path(self._tempdir) / "email_as_html.html"
|
||||
html_file.write_text(render_to_string("email_msg_template.html", context=data))
|
||||
|
||||
return html_file
|
||||
|
||||
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
|
||||
"""Create a PDF from the email body using an HTML template and Gotenberg.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mail:
|
||||
Parsed mail message.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
logger.info("Converting mail to PDF")
|
||||
|
||||
css_file = (
|
||||
Path(__file__).parent.parent.parent
|
||||
/ "paperless_mail"
|
||||
/ "templates"
|
||||
/ "output.css"
|
||||
)
|
||||
email_html_file = self.mail_to_html(mail)
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.chromium.html_to_pdf() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
try:
|
||||
response = (
|
||||
route.index(email_html_file)
|
||||
.resource(css_file)
|
||||
.margins(
|
||||
PageMarginsType(
|
||||
top=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
bottom=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
left=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
right=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
),
|
||||
)
|
||||
.size(A4)
|
||||
.scale(1.0)
|
||||
.run()
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting email to PDF: {err}",
|
||||
) from err
|
||||
|
||||
email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
|
||||
email_as_pdf_file.write_bytes(response.content)
|
||||
|
||||
return email_as_pdf_file
|
||||
|
||||
def generate_pdf_from_html(
|
||||
self,
|
||||
orig_html: str,
|
||||
attachments: list[MailAttachment],
|
||||
) -> Path:
|
||||
"""Generate a PDF from the HTML content of the email.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
orig_html:
|
||||
Raw HTML string from the email body.
|
||||
attachments:
|
||||
List of email attachments (used as inline resources).
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
|
||||
def clean_html_script(text: str) -> str:
|
||||
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
||||
text = compiled_open.sub("<div hidden ", text)
|
||||
|
||||
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
|
||||
text = compiled_close.sub("</div", text)
|
||||
return text
|
||||
|
||||
logger.info("Converting message html to PDF")
|
||||
|
||||
tempdir = Path(self._tempdir)
|
||||
|
||||
html_clean = clean_html_script(orig_html)
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.chromium.html_to_pdf() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
# Add attachments as resources, cleaning the filename and replacing
|
||||
# it in the index file for inclusion
|
||||
for attachment in attachments:
|
||||
# Clean the attachment name to be valid
|
||||
name_cid = f"cid:{attachment.content_id}"
|
||||
name_clean = "".join(e for e in name_cid if e.isalnum())
|
||||
|
||||
# Write attachment payload to a temp file
|
||||
temp_file = tempdir / name_clean
|
||||
temp_file.write_bytes(attachment.payload)
|
||||
|
||||
route.resource(temp_file)
|
||||
|
||||
# Replace as needed the name with the clean name
|
||||
html_clean = html_clean.replace(name_cid, name_clean)
|
||||
|
||||
# Now store the cleaned up HTML version
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
# This is our index file, the main page basically
|
||||
route.index(html_clean_file)
|
||||
|
||||
# Set page size, margins
|
||||
route.margins(
|
||||
PageMarginsType(
|
||||
top=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
bottom=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
left=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
right=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
),
|
||||
).size(A4).scale(1.0)
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
|
||||
html_pdf = tempdir / "html.pdf"
|
||||
html_pdf.write_bytes(response.content)
|
||||
return html_pdf
|
||||
@@ -193,11 +193,17 @@ class ParserRegistry:
|
||||
that log output is predictable; scoring determines which parser wins
|
||||
at runtime regardless of registration order.
|
||||
"""
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
self.register_builtin(TextDocumentParser)
|
||||
self.register_builtin(RemoteDocumentParser)
|
||||
self.register_builtin(TikaDocumentParser)
|
||||
self.register_builtin(MailDocumentParser)
|
||||
self.register_builtin(RasterisedDocumentParser)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Discovery
|
||||
|
||||
433
src/paperless/parsers/remote.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
Built-in remote-OCR document parser.
|
||||
|
||||
Handles documents by sending them to a configured remote OCR engine
|
||||
(currently Azure AI Vision / Document Intelligence) and retrieving both
|
||||
the extracted text and a searchable PDF with an embedded text layer.
|
||||
|
||||
When no engine is configured, ``score()`` returns ``None`` so the parser
|
||||
is effectively invisible to the registry — the tesseract parser handles
|
||||
these MIME types instead.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.remote")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
|
||||
|
||||
class RemoteEngineConfig:
|
||||
"""Holds and validates the remote OCR engine configuration."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine: str | None,
|
||||
api_key: str | None = None,
|
||||
endpoint: str | None = None,
|
||||
) -> None:
|
||||
self.engine = engine
|
||||
self.api_key = api_key
|
||||
self.endpoint = endpoint
|
||||
|
||||
def engine_is_valid(self) -> bool:
|
||||
"""Return True when the engine is known and fully configured."""
|
||||
return (
|
||||
self.engine in ("azureai",)
|
||||
and self.api_key is not None
|
||||
and not (self.engine == "azureai" and self.endpoint is None)
|
||||
)
|
||||
|
||||
|
||||
class RemoteDocumentParser:
|
||||
"""Parse documents via a remote OCR API (currently Azure AI Vision).
|
||||
|
||||
This parser sends documents to a remote engine that returns both
|
||||
extracted text and a searchable PDF with an embedded text layer.
|
||||
It does not depend on Tesseract or ocrmypdf.
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
name: str = "Paperless-ngx Remote OCR Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser can handle.
|
||||
|
||||
The full set is always returned regardless of whether a remote
|
||||
engine is configured. The ``score()`` method handles the
|
||||
"am I active?" logic by returning ``None`` when not configured.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file, or None.
|
||||
|
||||
Returns ``None`` when no valid remote engine is configured,
|
||||
making the parser invisible to the registry for this file.
|
||||
When configured, returns 20 — higher than the Tesseract parser's
|
||||
default of 10 — so the remote engine takes priority.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
20 when the remote engine is configured and the MIME type is
|
||||
supported, otherwise None.
|
||||
"""
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
if not config.engine_is_valid():
|
||||
return None
|
||||
if mime_type not in _SUPPORTED_MIME_TYPES:
|
||||
return None
|
||||
return 20
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always True — the remote engine always returns a PDF with an
|
||||
embedded text layer that serves as the archive copy.
|
||||
"""
|
||||
return True
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — all supported originals are displayable by
|
||||
the browser (PDF) or handled via the archive copy (images).
|
||||
"""
|
||||
return False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._logging_group = logging_group
|
||||
self._text: str | None = None
|
||||
self._archive_path: Path | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Send the document to the remote engine and store results.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the document file to parse.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
produce_archive:
|
||||
Ignored — the remote engine always returns a searchable PDF,
|
||||
which is stored as the archive copy regardless of this flag.
|
||||
"""
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
|
||||
if not config.engine_is_valid():
|
||||
logger.warning(
|
||||
"No valid remote parser engine is configured, content will be empty.",
|
||||
)
|
||||
self._text = ""
|
||||
return
|
||||
|
||||
if config.engine == "azureai":
|
||||
self._text = self._azure_ai_vision_parse(document_path, config)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse."""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Always None — the remote parser does not detect dates.
|
||||
"""
|
||||
return None
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to the generated archive PDF, or None."""
|
||||
return self._archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
"""Generate a thumbnail image for the document.
|
||||
|
||||
Uses the archive PDF produced by the remote engine when available,
|
||||
otherwise falls back to the original document path (PDF inputs).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temp directory.
|
||||
"""
|
||||
# make_thumbnail_from_pdf lives in documents.parsers for now;
|
||||
# it will move to paperless.parsers.utils when the tesseract
|
||||
# parser is migrated in a later phase.
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self._archive_path or document_path,
|
||||
self._tempdir,
|
||||
self._logging_group,
|
||||
)
|
||||
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in a PDF document.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count for PDF inputs, or ``None`` for other MIME types.
|
||||
"""
|
||||
if mime_type != "application/pdf":
|
||||
return None
|
||||
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(document_path, log=logger)
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract format-specific metadata from the document.
|
||||
|
||||
Delegates to the shared pikepdf-based extractor for PDF files.
|
||||
Returns ``[]`` for all other MIME types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the file to extract metadata from.
|
||||
mime_type:
|
||||
MIME type of the file. May be ``"application/pdf"`` when
|
||||
called for the archive version of an image original.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
Zero or more metadata entries.
|
||||
"""
|
||||
if mime_type != "application/pdf":
|
||||
return []
|
||||
|
||||
from paperless.parsers.utils import extract_pdf_metadata
|
||||
|
||||
return extract_pdf_metadata(document_path, log=logger)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _azure_ai_vision_parse(
|
||||
self,
|
||||
file: Path,
|
||||
config: RemoteEngineConfig,
|
||||
) -> str | None:
|
||||
"""Send ``file`` to Azure AI Document Intelligence and return text.
|
||||
|
||||
Downloads the searchable PDF output from Azure and stores it at
|
||||
``self._archive_path``. Returns the extracted text content, or
|
||||
``None`` on failure (the error is logged).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
file:
|
||||
Absolute path to the document to analyse.
|
||||
config:
|
||||
Validated remote engine configuration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if the Azure call failed.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
# Callers must have already validated config via engine_is_valid():
|
||||
# engine_is_valid() asserts api_key is not None and (for azureai)
|
||||
# endpoint is not None, so these casts are provably safe.
|
||||
assert config.endpoint is not None
|
||||
assert config.api_key is not None
|
||||
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
client = DocumentIntelligenceClient(
|
||||
endpoint=config.endpoint,
|
||||
credential=AzureKeyCredential(config.api_key),
|
||||
)
|
||||
|
||||
try:
|
||||
with file.open("rb") as f:
|
||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||
poller = client.begin_analyze_document(
|
||||
model_id="prebuilt-read",
|
||||
body=analyze_request,
|
||||
output_content_format=DocumentContentFormat.TEXT,
|
||||
output=[AnalyzeOutputOption.PDF],
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
poller.wait()
|
||||
result_id = poller.details["operation_id"]
|
||||
result = poller.result()
|
||||
|
||||
self._archive_path = self._tempdir / "archive.pdf"
|
||||
with self._archive_path.open("wb") as f:
|
||||
for chunk in client.get_analyze_result_pdf(
|
||||
model_id="prebuilt-read",
|
||||
result_id=result_id,
|
||||
):
|
||||
f.write(chunk)
|
||||
|
||||
return result.content
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Azure AI Vision parsing failed: %s", e)
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
return None
|
||||
@@ -1,13 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
@@ -16,6 +21,28 @@ from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/tiff": ".tif",
|
||||
"image/gif": ".gif",
|
||||
"image/bmp": ".bmp",
|
||||
"image/webp": ".webp",
|
||||
"image/heic": ".heic",
|
||||
}
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
@@ -26,81 +53,125 @@ class RtlLanguageException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
class RasterisedDocumentParser:
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
name: str = "Paperless-ngx Tesseract OCR Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
def get_settings(self) -> OcrConfig:
|
||||
"""
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
"""
|
||||
return OcrConfig()
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
page_count = None
|
||||
if mime_type == "application/pdf":
|
||||
try:
|
||||
import pikepdf
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Unable to determine PDF page count {document_path}: {e}",
|
||||
)
|
||||
return page_count
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
import pikepdf
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
return True
|
||||
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None: # pragma: no cover
|
||||
continue
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"namespace": namespace,
|
||||
"prefix": meta.REVERSE_NS[namespace],
|
||||
"key": key_value,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
||||
)
|
||||
return result
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
return False
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self.tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self.settings = OcrConfig()
|
||||
self.archive_path: Path | None = None
|
||||
self.text: str | None = None
|
||||
self.date: datetime.datetime | None = None
|
||||
self.log = logger
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self.tempdir)
|
||||
shutil.rmtree(self.tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self.text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
return self.date
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
return self.archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail, page count, and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path or document_path,
|
||||
self.archive_path or Path(document_path),
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def is_image(self, mime_type) -> bool:
|
||||
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||
if mime_type == "application/pdf":
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(Path(document_path), log=self.log)
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
if mime_type != "application/pdf":
|
||||
return []
|
||||
|
||||
from paperless.parsers.utils import extract_pdf_metadata
|
||||
|
||||
return extract_pdf_metadata(Path(document_path), log=self.log)
|
||||
|
||||
def is_image(self, mime_type: str) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
@@ -111,25 +182,25 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"image/heic",
|
||||
]
|
||||
|
||||
def has_alpha(self, image) -> bool:
|
||||
def has_alpha(self, image: Path) -> bool:
|
||||
with Image.open(image) as im:
|
||||
return im.mode in ("RGBA", "LA")
|
||||
|
||||
def remove_alpha(self, image_path: str) -> Path:
|
||||
def remove_alpha(self, image_path: Path) -> Path:
|
||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||
run_subprocess(
|
||||
[
|
||||
settings.CONVERT_BINARY,
|
||||
"-alpha",
|
||||
"off",
|
||||
image_path,
|
||||
no_alpha_image,
|
||||
str(image_path),
|
||||
str(no_alpha_image),
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
return no_alpha_image
|
||||
|
||||
def get_dpi(self, image) -> int | None:
|
||||
def get_dpi(self, image: Path) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
x, _ = im.info["dpi"]
|
||||
@@ -138,7 +209,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image) -> int | None:
|
||||
def calculate_a4_dpi(self, image: Path) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
width, _ = im.size
|
||||
@@ -156,6 +227,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
sidecar_file: Path | None,
|
||||
pdf_file: Path,
|
||||
) -> str | None:
|
||||
text: str | None = None
|
||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||
# the whole text, so do not utilize it in that case
|
||||
if (
|
||||
@@ -163,7 +235,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != "redo"
|
||||
):
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
text = read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
if "[OCR skipped on page" not in text:
|
||||
# This happens when there's already text in the input file.
|
||||
@@ -191,12 +263,12 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
pdf_file,
|
||||
str(pdf_file),
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
@@ -211,16 +283,14 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
input_file,
|
||||
mime_type,
|
||||
output_file,
|
||||
sidecar_file,
|
||||
input_file: Path,
|
||||
mime_type: str,
|
||||
output_file: Path,
|
||||
sidecar_file: Path,
|
||||
*,
|
||||
safe_fallback=False,
|
||||
):
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(self.settings, OcrConfig)
|
||||
ocrmypdf_args = {
|
||||
safe_fallback: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
ocrmypdf_args: dict[str, Any] = {
|
||||
"input_file_or_options": input_file,
|
||||
"output_file": output_file,
|
||||
# need to use threads, since this will be run in daemonized
|
||||
@@ -330,7 +400,13 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None) -> None:
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
@@ -458,7 +534,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.text = ""
|
||||
|
||||
|
||||
def post_process_text(text):
|
||||
def post_process_text(text: str | None) -> str | None:
|
||||
if not text:
|
||||
return None
|
||||
|
||||
@@ -27,6 +27,7 @@ if TYPE_CHECKING:
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.text")
|
||||
|
||||
@@ -156,6 +157,9 @@ class TextDocumentParser:
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
|
||||
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tika")
|
||||
|
||||
@@ -205,6 +206,9 @@ class TikaDocumentParser:
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
@@ -340,11 +344,19 @@ class TikaDocumentParser:
|
||||
) -> int | None:
|
||||
"""Return the number of pages in the document.
|
||||
|
||||
Counts pages in the archive PDF produced by a preceding parse()
|
||||
call. Returns ``None`` if parse() has not been called yet or if
|
||||
no archive was produced.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Always None — page count is not available from Tika.
|
||||
Page count of the archive PDF, or ``None``.
|
||||
"""
|
||||
if self._archive_path is not None:
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(self._archive_path, log=logger)
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
|
||||
158
src/paperless/parsers/utils.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
Shared utilities for Paperless-ngx document parsers.
|
||||
|
||||
Functions here are format-neutral helpers that multiple parsers need.
|
||||
Keeping them here avoids parsers inheriting from each other just to
|
||||
share implementation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
|
||||
def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str:
|
||||
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath:
|
||||
Absolute path to the file to read.
|
||||
log:
|
||||
Logger to use for warnings. Falls back to the module-level logger
|
||||
when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
File content as a string, with any invalid UTF-8 sequences replaced
|
||||
by the Unicode replacement character.
|
||||
"""
|
||||
_log = log or logger
|
||||
try:
|
||||
return filepath.read_text(encoding="utf-8")
|
||||
except UnicodeDecodeError as e:
|
||||
_log.warning("Unicode error during text reading, continuing: %s", e)
|
||||
return filepath.read_bytes().decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def get_page_count_for_pdf(
|
||||
document_path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in a PDF file using pikepdf.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger to use for warnings. Falls back to the module-level logger
|
||||
when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count, or ``None`` if the file cannot be opened or is not a
|
||||
valid PDF.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
_log = log or logger
|
||||
|
||||
try:
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
return len(pdf.pages)
|
||||
except Exception as e:
|
||||
_log.warning("Unable to determine PDF page count for %s: %s", document_path, e)
|
||||
return None
|
||||
|
||||
|
||||
def extract_pdf_metadata(
|
||||
document_path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract XMP/PDF metadata from a PDF file using pikepdf.
|
||||
|
||||
Reads all XMP metadata entries from the document and returns them as a
|
||||
list of ``MetadataEntry`` dicts. The method never raises — any failure
|
||||
to open the file or read a specific key is logged and skipped.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger to use for warnings and debug messages. Falls back to the
|
||||
module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
Zero or more metadata entries. Returns ``[]`` if the file cannot
|
||||
be opened or contains no readable XMP metadata.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
_log = log or logger
|
||||
result: list[MetadataEntry] = []
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
try:
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
except Exception as e:
|
||||
_log.warning("Could not open PDF metadata for %s: %s", document_path, e)
|
||||
return []
|
||||
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join(str(e) for e in value)
|
||||
value = str(value)
|
||||
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None:
|
||||
continue
|
||||
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as enc_err:
|
||||
_log.debug("Skipping metadata key %s: %s", key, enc_err)
|
||||
continue
|
||||
|
||||
result.append(
|
||||
MetadataEntry(
|
||||
namespace=namespace,
|
||||
prefix=meta.REVERSE_NS[namespace],
|
||||
key=key_value,
|
||||
value=value,
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
_log.warning(
|
||||
"Error reading metadata key %s value %s: %s",
|
||||
key,
|
||||
value,
|
||||
e,
|
||||
)
|
||||
|
||||
return result
|
||||
@@ -6,16 +6,29 @@ so it is easy to see which files belong to which test module.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
#: Type for the ``make_tesseract_parser`` fixture factory.
|
||||
MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@@ -77,6 +90,92 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def remote_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the remote parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/remote/``
|
||||
"""
|
||||
return samples_dir / "remote"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_pdf_file(remote_samples_dir: Path) -> Path:
|
||||
"""Path to a simple digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``remote/simple-digital.pdf``.
|
||||
"""
|
||||
return remote_samples_dir / "simple-digital.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
|
||||
"""Yield a RemoteDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
RemoteDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with RemoteDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser settings helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
|
||||
"""Configure Django settings for a valid Azure AI OCR engine.
|
||||
|
||||
Sets ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``, and
|
||||
``REMOTE_OCR_ENDPOINT`` to test values. Settings are restored
|
||||
automatically after the test by pytest-django.
|
||||
|
||||
Returns
|
||||
-------
|
||||
SettingsWrapper
|
||||
The modified settings object (for chaining further overrides).
|
||||
"""
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "test-api-key"
|
||||
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||
return settings
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
|
||||
"""Configure Django settings with no remote engine configured.
|
||||
|
||||
Returns
|
||||
-------
|
||||
SettingsWrapper
|
||||
The modified settings object.
|
||||
"""
|
||||
settings.REMOTE_OCR_ENGINE = None
|
||||
settings.REMOTE_OCR_API_KEY = None
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
return settings
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tika parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
@@ -158,3 +257,544 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]:
|
||||
"""
|
||||
with TikaDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Mail parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def mail_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the mail parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/mail/``
|
||||
"""
|
||||
return samples_dir / "mail"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def broken_email_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to a broken/malformed EML sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/broken.eml``.
|
||||
"""
|
||||
return mail_samples_dir / "broken.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to a plain-text email sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/simple_text.eml``.
|
||||
"""
|
||||
return mail_samples_dir / "simple_text.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected PDF rendition of the plain-text email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/simple_text.eml.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "simple_text.eml.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected thumbnail for the plain-text email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/simple_text.eml.pdf.webp``.
|
||||
"""
|
||||
return mail_samples_dir / "simple_text.eml.pdf.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to an HTML email sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_pdf_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected PDF rendition of the HTML email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected thumbnail for the HTML email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml.pdf.webp``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml.pdf.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_html_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the HTML body of the HTML email sample.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml.html``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml.html"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def merged_pdf_first(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the first PDF used in PDF-merge tests.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/first.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "first.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def merged_pdf_second(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the second PDF used in PDF-merge tests.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/second.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "second.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Mail parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mail_parser() -> Generator[MailDocumentParser, None, None]:
|
||||
"""Yield a MailDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
MailDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with MailDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def nginx_base_url() -> Generator[str, None, None]:
|
||||
"""
|
||||
The base URL for the nginx HTTP server we expect to be alive
|
||||
"""
|
||||
yield "http://localhost:8080"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tesseract parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tesseract_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the tesseract parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/tesseract/``
|
||||
"""
|
||||
return samples_dir / "tesseract"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def document_webp_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a WebP document sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/document.webp``.
|
||||
"""
|
||||
return tesseract_samples_dir / "document.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to an encrypted PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/encrypted.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "encrypted.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-digital.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-digital.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page TIFF with alpha channel in RGB.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images-alpha-rgb.tiff``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images-alpha-rgb.tiff"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page TIFF with alpha channel.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images-alpha.tiff``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images-alpha.tiff"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page PDF with images.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page TIFF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images.tiff``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images.tiff"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page mixed PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-mixed.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-mixed.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a PNG with alpha channel and no text.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/no-text-alpha.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "no-text-alpha.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a rotated PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/rotated.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "rotated.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to an RTL test PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/rtl-test.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "rtl-test.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a signed PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/signed.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "signed.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple PNG with alpha channel.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple-alpha.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple-alpha.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple-digital.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple-digital.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple PNG without DPI information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple-no-dpi.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple-no-dpi.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple BMP sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.bmp``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.bmp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_gif_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple GIF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.gif``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.gif"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_heic_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple HEIC sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.heic``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.heic"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple JPG sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.jpg``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.jpg"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple PNG sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_tif_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple TIF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.tif``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.tif"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a single-page mixed PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/single-page-mixed.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "single-page-mixed.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a PDF with form sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/with-form.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "with-form.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tesseract parser instance and settings helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||
"""Return a MagicMock with all OcrConfig fields set to None.
|
||||
|
||||
This allows the parser to fall back to Django settings instead of
|
||||
hitting the database.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MagicMock
|
||||
Mock config with all fields as None
|
||||
"""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
skip_archive_file=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tesseract_parser(
|
||||
mocker: MockerFixture,
|
||||
null_app_config: MagicMock,
|
||||
) -> Generator[RasterisedDocumentParser, None, None]:
|
||||
"""Yield a RasterisedDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Patches the config system to avoid database access.
|
||||
|
||||
Yields
|
||||
------
|
||||
RasterisedDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
with RasterisedDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_tesseract_parser(
|
||||
mocker: MockerFixture,
|
||||
null_app_config: MagicMock,
|
||||
) -> MakeTesseractParser:
|
||||
"""Return a factory for creating RasterisedDocumentParser with Django settings overrides.
|
||||
|
||||
This fixture is useful for tests that need to create parsers with different
|
||||
settings configurations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Callable[..., contextmanager[RasterisedDocumentParser]]
|
||||
A context manager factory that accepts Django settings overrides
|
||||
"""
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
@contextmanager
|
||||
def _make_parser(**django_settings_overrides):
|
||||
with override_settings(**django_settings_overrides):
|
||||
with RasterisedDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
return _make_parser
|
||||
|
||||
@@ -12,7 +12,64 @@ from pytest_httpx import HTTPXMock
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
|
||||
class TestMailParserProtocol:
|
||||
"""Verify that MailDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_isinstance_satisfies_protocol(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
) -> None:
|
||||
assert isinstance(mail_parser, ParserProtocol)
|
||||
|
||||
def test_supported_mime_types(self) -> None:
|
||||
mime_types = MailDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
assert "message/rfc822" in mime_types
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("mime_type", "expected"),
|
||||
[
|
||||
("message/rfc822", 10),
|
||||
("application/pdf", None),
|
||||
("text/plain", None),
|
||||
],
|
||||
)
|
||||
def test_score(self, mime_type: str, expected: int | None) -> None:
|
||||
assert MailDocumentParser.score(mime_type, "email.eml") == expected
|
||||
|
||||
def test_can_produce_archive_is_false(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
) -> None:
|
||||
assert mail_parser.can_produce_archive is False
|
||||
|
||||
def test_requires_pdf_rendition_is_true(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
) -> None:
|
||||
assert mail_parser.requires_pdf_rendition is True
|
||||
|
||||
def test_get_page_count_returns_none_without_archive(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
html_email_file: Path,
|
||||
) -> None:
|
||||
assert mail_parser.get_page_count(html_email_file, "message/rfc822") is None
|
||||
|
||||
def test_get_page_count_returns_int_with_pdf_archive(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
simple_txt_email_pdf_file: Path,
|
||||
) -> None:
|
||||
mail_parser._archive_path = simple_txt_email_pdf_file
|
||||
count = mail_parser.get_page_count(simple_txt_email_pdf_file, "message/rfc822")
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
class TestEmailFileParsing:
|
||||
@@ -24,7 +81,7 @@ class TestEmailFileParsing:
|
||||
def test_parse_error_missing_file(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
sample_dir: Path,
|
||||
mail_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -35,7 +92,7 @@ class TestEmailFileParsing:
|
||||
- An Exception is thrown
|
||||
"""
|
||||
# Check if exception is raised when parsing fails.
|
||||
test_file = sample_dir / "doesntexist.eml"
|
||||
test_file = mail_samples_dir / "doesntexist.eml"
|
||||
|
||||
assert not test_file.exists()
|
||||
|
||||
@@ -246,12 +303,12 @@ class TestEmailThumbnailGenerate:
|
||||
"""
|
||||
mocked_return = "Passing the return value through.."
|
||||
mock_make_thumbnail_from_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.make_thumbnail_from_pdf",
|
||||
"paperless.parsers.mail.make_thumbnail_from_pdf",
|
||||
)
|
||||
mock_make_thumbnail_from_pdf.return_value = mocked_return
|
||||
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
mock_generate_pdf.return_value = "Mocked return value.."
|
||||
|
||||
@@ -260,8 +317,7 @@ class TestEmailThumbnailGenerate:
|
||||
mock_generate_pdf.assert_called_once()
|
||||
mock_make_thumbnail_from_pdf.assert_called_once_with(
|
||||
"Mocked return value..",
|
||||
mail_parser.tempdir,
|
||||
None,
|
||||
mail_parser._tempdir,
|
||||
)
|
||||
|
||||
assert mocked_return == thumb
|
||||
@@ -373,7 +429,7 @@ class TestParser:
|
||||
"""
|
||||
# Validate parsing returns the expected results
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
|
||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||
@@ -385,7 +441,7 @@ class TestParser:
|
||||
"BCC: fdf@fvf.de\n\n"
|
||||
"\n\nThis is just a simple Text Mail."
|
||||
)
|
||||
assert text_expected == mail_parser.text
|
||||
assert text_expected == mail_parser.get_text()
|
||||
assert (
|
||||
datetime.datetime(
|
||||
2022,
|
||||
@@ -396,7 +452,7 @@ class TestParser:
|
||||
43,
|
||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||
)
|
||||
== mail_parser.date
|
||||
== mail_parser.get_date()
|
||||
)
|
||||
|
||||
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
|
||||
@@ -419,7 +475,7 @@ class TestParser:
|
||||
"""
|
||||
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
|
||||
# Validate parsing returns the expected results
|
||||
@@ -443,7 +499,7 @@ class TestParser:
|
||||
mail_parser.parse(html_email_file, "message/rfc822")
|
||||
|
||||
mock_generate_pdf.assert_called_once()
|
||||
assert text_expected == mail_parser.text
|
||||
assert text_expected == mail_parser.get_text()
|
||||
assert (
|
||||
datetime.datetime(
|
||||
2022,
|
||||
@@ -454,7 +510,7 @@ class TestParser:
|
||||
19,
|
||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||
)
|
||||
== mail_parser.date
|
||||
== mail_parser.get_date()
|
||||
)
|
||||
|
||||
def test_generate_pdf_parse_error(
|
||||
@@ -501,7 +557,7 @@ class TestParser:
|
||||
|
||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||
|
||||
assert mail_parser.archive_path is not None
|
||||
assert mail_parser.get_archive_path() is not None
|
||||
|
||||
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
|
||||
def test_generate_pdf_html_email(
|
||||
@@ -542,7 +598,7 @@ class TestParser:
|
||||
)
|
||||
mail_parser.parse(html_email_file, "message/rfc822")
|
||||
|
||||
assert mail_parser.archive_path is not None
|
||||
assert mail_parser.get_archive_path() is not None
|
||||
|
||||
def test_generate_pdf_html_email_html_to_pdf_failure(
|
||||
self,
|
||||
@@ -712,10 +768,10 @@ class TestParser:
|
||||
|
||||
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
|
||||
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
|
||||
mail_parser.configure(ParserContext(mailrule_id=1))
|
||||
mail_parser.parse(
|
||||
document_path=html_email_file,
|
||||
mime_type="message/rfc822",
|
||||
mailrule_id=1,
|
||||
)
|
||||
args, _ = mock_merge_route.call_args
|
||||
assert len(args[0]) == expected_calls
|
||||
@@ -11,7 +11,7 @@ from PIL import Image
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.tests.utils import util_call_with_backoff
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
|
||||
def extract_text(pdf_path: Path) -> str:
|
||||
@@ -159,7 +159,7 @@ class TestParserLive:
|
||||
- The returned thumbnail image file shall match the expected hash
|
||||
"""
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
||||
|
||||
@@ -216,10 +216,10 @@ class TestParserLive:
|
||||
- The merged PDF shall contain text from both source PDFs
|
||||
"""
|
||||
mock_generate_pdf_from_html = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
|
||||
)
|
||||
mock_generate_pdf_from_mail = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
|
||||
)
|
||||
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
||||
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
||||
497
src/paperless/tests/parsers/test_remote_parser.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""
|
||||
Tests for paperless.parsers.remote.RemoteDocumentParser.
|
||||
|
||||
All tests use the context-manager protocol for parser lifecycle.
|
||||
|
||||
Fixture layout
|
||||
--------------
|
||||
make_azure_mock — factory (defined here; specific to this module)
|
||||
azure_client — composes azure_settings + make_azure_mock + patch;
|
||||
use when a test needs the client to succeed
|
||||
failing_azure_client
|
||||
— composes azure_settings + patch with RuntimeError;
|
||||
use when a test needs the client to fail
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-local fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
|
||||
_DEFAULT_TEXT = "Extracted text."
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_azure_mock() -> Callable[[str], Mock]:
|
||||
"""Return a factory that builds a mock Azure DocumentIntelligenceClient.
|
||||
|
||||
Usage::
|
||||
|
||||
mock_client = make_azure_mock() # default extracted text
|
||||
mock_client = make_azure_mock("My text.") # custom extracted text
|
||||
"""
|
||||
|
||||
def _factory(text: str = _DEFAULT_TEXT) -> Mock:
|
||||
mock_client = Mock()
|
||||
mock_poller = Mock()
|
||||
mock_poller.wait.return_value = None
|
||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
||||
mock_poller.result.return_value.content = text
|
||||
mock_client.begin_analyze_document.return_value = mock_poller
|
||||
mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
|
||||
return mock_client
|
||||
|
||||
return _factory
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def azure_client(
|
||||
azure_settings: SettingsWrapper,
|
||||
make_azure_mock: Callable[[str], Mock],
|
||||
mocker: MockerFixture,
|
||||
) -> Mock:
|
||||
"""Patch the Azure DI client with a succeeding mock and return the instance.
|
||||
|
||||
Implicitly applies ``azure_settings`` so tests using this fixture do not
|
||||
also need ``@pytest.mark.usefixtures("azure_settings")``.
|
||||
"""
|
||||
mock_client = make_azure_mock()
|
||||
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||
return mock_client
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def failing_azure_client(
|
||||
azure_settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> Mock:
|
||||
"""Patch the Azure DI client to raise RuntimeError on every call.
|
||||
|
||||
Implicitly applies ``azure_settings``. Returns the mock instance so
|
||||
tests can assert on calls such as ``close()``.
|
||||
"""
|
||||
mock_client = Mock()
|
||||
mock_client.begin_analyze_document.side_effect = RuntimeError("network failure")
|
||||
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||
return mock_client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol contract
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserProtocol:
|
||||
"""Verify that RemoteDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_isinstance_satisfies_protocol(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert isinstance(remote_parser, ParserProtocol)
|
||||
|
||||
def test_class_attributes_present(self) -> None:
|
||||
assert isinstance(RemoteDocumentParser.name, str) and RemoteDocumentParser.name
|
||||
assert (
|
||||
isinstance(RemoteDocumentParser.version, str)
|
||||
and RemoteDocumentParser.version
|
||||
)
|
||||
assert (
|
||||
isinstance(RemoteDocumentParser.author, str) and RemoteDocumentParser.author
|
||||
)
|
||||
assert isinstance(RemoteDocumentParser.url, str) and RemoteDocumentParser.url
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# supported_mime_types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserSupportedMimeTypes:
|
||||
"""supported_mime_types() always returns the full set regardless of config."""
|
||||
|
||||
def test_returns_dict(self) -> None:
|
||||
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
|
||||
def test_includes_all_expected_types(self) -> None:
|
||||
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||
expected = {
|
||||
"application/pdf",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
}
|
||||
assert expected == set(mime_types.keys())
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
def test_returns_full_set_when_not_configured(self) -> None:
|
||||
"""
|
||||
GIVEN: No remote engine is configured
|
||||
WHEN: supported_mime_types() is called
|
||||
THEN: The full MIME type dict is still returned (score() handles activation)
|
||||
"""
|
||||
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||
assert len(mime_types) == 7
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# score()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserScore:
|
||||
"""score() encodes the activation logic: None when unconfigured, 20 when active."""
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
@pytest.mark.parametrize(
|
||||
"mime_type",
|
||||
[
|
||||
pytest.param("application/pdf", id="pdf"),
|
||||
pytest.param("image/png", id="png"),
|
||||
pytest.param("image/jpeg", id="jpeg"),
|
||||
pytest.param("image/tiff", id="tiff"),
|
||||
pytest.param("image/bmp", id="bmp"),
|
||||
pytest.param("image/gif", id="gif"),
|
||||
pytest.param("image/webp", id="webp"),
|
||||
],
|
||||
)
|
||||
def test_score_returns_20_when_configured(self, mime_type: str) -> None:
|
||||
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||
assert result == 20
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
@pytest.mark.parametrize(
|
||||
"mime_type",
|
||||
[
|
||||
pytest.param("application/pdf", id="pdf"),
|
||||
pytest.param("image/png", id="png"),
|
||||
pytest.param("image/jpeg", id="jpeg"),
|
||||
],
|
||||
)
|
||||
def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
|
||||
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
def test_score_returns_none_when_api_key_missing(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = None
|
||||
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
def test_score_returns_none_when_endpoint_missing(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "key"
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
def test_score_returns_none_for_unsupported_mime_type(self) -> None:
|
||||
result = RemoteDocumentParser.score("text/plain", "doc.txt")
|
||||
assert result is None
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
def test_score_higher_than_tesseract_default(self) -> None:
|
||||
"""Remote parser (20) outranks the tesseract default (10) when configured."""
|
||||
score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert score is not None and score > 10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Properties
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserProperties:
|
||||
def test_can_produce_archive_is_true(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.can_produce_archive is True
|
||||
|
||||
def test_requires_pdf_rendition_is_false(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.requires_pdf_rendition is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserLifecycle:
|
||||
def test_context_manager_cleans_up_tempdir(self) -> None:
|
||||
with RemoteDocumentParser() as parser:
|
||||
tempdir = parser._tempdir
|
||||
assert tempdir.exists()
|
||||
assert not tempdir.exists()
|
||||
|
||||
def test_context_manager_cleans_up_after_exception(self) -> None:
|
||||
tempdir: Path | None = None
|
||||
with pytest.raises(RuntimeError):
|
||||
with RemoteDocumentParser() as parser:
|
||||
tempdir = parser._tempdir
|
||||
raise RuntimeError("boom")
|
||||
assert tempdir is not None
|
||||
assert not tempdir.exists()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse() — happy path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserParse:
|
||||
def test_parse_returns_text_from_azure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||
|
||||
def test_parse_sets_archive_path(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
archive = remote_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
assert archive.exists()
|
||||
assert archive.suffix == ".pdf"
|
||||
|
||||
def test_parse_closes_client_on_success(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.configure(ParserContext())
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
azure_client.close.assert_called_once()
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
def test_parse_sets_empty_text_when_not_configured(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == ""
|
||||
assert remote_parser.get_archive_path() is None
|
||||
|
||||
def test_get_text_none_before_parse(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.get_text() is None
|
||||
|
||||
def test_get_date_always_none(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_date() is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse() — Azure failure path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserParseError:
|
||||
def test_parse_returns_none_on_azure_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() is None
|
||||
|
||||
def test_parse_closes_client_on_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
failing_azure_client.close.assert_called_once()
|
||||
|
||||
def test_parse_logs_error_on_azure_failure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
mock_log.error.assert_called_once()
|
||||
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_page_count()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserPageCount:
|
||||
def test_page_count_for_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count >= 1
|
||||
|
||||
def test_page_count_returns_none_for_image_mime(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "image/png")
|
||||
assert count is None
|
||||
|
||||
def test_page_count_returns_none_for_invalid_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
bad_pdf = tmp_path / "bad.pdf"
|
||||
bad_pdf.write_bytes(b"not a pdf at all")
|
||||
count = remote_parser.get_page_count(bad_pdf, "application/pdf")
|
||||
assert count is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# extract_metadata()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserMetadata:
|
||||
def test_extract_metadata_non_pdf_returns_empty(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
|
||||
assert result == []
|
||||
|
||||
def test_extract_metadata_pdf_returns_list(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
assert isinstance(result, list)
|
||||
|
||||
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
for entry in result:
|
||||
assert "namespace" in entry
|
||||
assert "prefix" in entry
|
||||
assert "key" in entry
|
||||
assert "value" in entry
|
||||
assert isinstance(entry["value"], str)
|
||||
|
||||
def test_extract_metadata_does_not_raise_on_invalid_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
bad_pdf = tmp_path / "bad.pdf"
|
||||
bad_pdf.write_bytes(b"not a pdf at all")
|
||||
result = remote_parser.extract_metadata(bad_pdf, "application/pdf")
|
||||
assert result == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserRegistry:
|
||||
def test_registered_in_defaults(self) -> None:
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
|
||||
registry = ParserRegistry()
|
||||
registry.register_defaults()
|
||||
|
||||
assert RemoteDocumentParser in registry._builtins
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
def test_get_parser_returns_remote_when_configured(self) -> None:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||
|
||||
assert parser_cls is RemoteDocumentParser
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
|
||||
self,
|
||||
) -> None:
|
||||
"""With remote off and a truly unsupported MIME type, registry returns None."""
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
|
||||
registry = ParserRegistry()
|
||||
registry.register_defaults()
|
||||
parser_cls = registry.get_parser_for_file(
|
||||
"application/x-unknown-format",
|
||||
"doc.xyz",
|
||||
)
|
||||
|
||||
assert parser_cls is None
|
||||
@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
|
||||
from paperless.models import ColorConvertChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
1174
src/paperless/tests/parsers/test_tesseract_parser.py
Normal file
@@ -12,6 +12,7 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
@@ -93,6 +94,7 @@ class TestTextParserParse:
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_text() == "This is a test file.\n"
|
||||
@@ -102,6 +104,7 @@ class TestTextParserParse:
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_archive_path() is None
|
||||
@@ -111,6 +114,7 @@ class TestTextParserParse:
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_date() is None
|
||||
@@ -129,6 +133,7 @@ class TestTextParserParse:
|
||||
- Parsing succeeds
|
||||
- Invalid bytes are replaced with the Unicode replacement character
|
||||
"""
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(malformed_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_text() == "Pantothens\ufffdure\n"
|
||||
@@ -251,6 +256,9 @@ class TestTextParserRegistry:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||
parser_cls = registry.get_parser_for_file(
|
||||
"application/x-unknown-format",
|
||||
"doc.xyz",
|
||||
)
|
||||
|
||||
assert parser_cls is None
|
||||
|
||||
@@ -9,6 +9,7 @@ from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_httpx import HTTPXMock
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
@@ -60,6 +61,29 @@ class TestTikaParserRegistryInterface:
|
||||
def test_requires_pdf_rendition_is_true(self) -> None:
|
||||
assert TikaDocumentParser().requires_pdf_rendition is True
|
||||
|
||||
def test_get_page_count_returns_none_without_archive(
|
||||
self,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_odt_file: Path,
|
||||
) -> None:
|
||||
assert (
|
||||
tika_parser.get_page_count(
|
||||
sample_odt_file,
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
)
|
||||
is None
|
||||
)
|
||||
|
||||
def test_get_page_count_returns_int_with_pdf_archive(
|
||||
self,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
tika_parser._archive_path = sample_pdf_file
|
||||
count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
@pytest.mark.django_db()
|
||||
class TestTikaParser:
|
||||
@@ -83,6 +107,7 @@ class TestTikaParser:
|
||||
# Pretend convert to PDF response
|
||||
httpx_mock.add_response(content=b"PDF document")
|
||||
|
||||
tika_parser.configure(ParserContext())
|
||||
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
||||
|
||||
assert tika_parser.get_text() == "the content"
|
||||
|
||||
|
Before Width: | Height: | Size: 6.0 KiB After Width: | Height: | Size: 6.0 KiB |
|
Before Width: | Height: | Size: 2.8 KiB After Width: | Height: | Size: 2.8 KiB |
|
Before Width: | Height: | Size: 6.9 KiB After Width: | Height: | Size: 6.9 KiB |
|
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.2 KiB |
|
Before Width: | Height: | Size: 5.7 KiB After Width: | Height: | Size: 5.7 KiB |
|
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
|
Before Width: | Height: | Size: 8.2 KiB After Width: | Height: | Size: 8.2 KiB |
|
Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 6.8 KiB |
|
Before Width: | Height: | Size: 1.7 MiB After Width: | Height: | Size: 1.7 MiB |
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |
|
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
|
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
@@ -18,6 +18,7 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
@@ -103,6 +104,11 @@ def dummy_parser_cls() -> type:
|
||||
) -> list:
|
||||
return []
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
"""
|
||||
Required to exist, but doesn't need to do anything
|
||||
"""
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
@@ -144,6 +150,7 @@ class TestParserProtocol:
|
||||
@pytest.mark.parametrize(
|
||||
"missing_method",
|
||||
[
|
||||
pytest.param("configure", id="missing-configure"),
|
||||
pytest.param("parse", id="missing-parse"),
|
||||
pytest.param("get_text", id="missing-get_text"),
|
||||
pytest.param("get_thumbnail", id="missing-get_thumbnail"),
|
||||
|
||||
10
src/paperless_ai/tests/conftest.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path: Path, settings: SettingsWrapper):
|
||||
settings.LLM_INDEX_DIR = tmp_path
|
||||
return tmp_path
|
||||
@@ -13,14 +13,6 @@ from documents.models import PaperlessTask
|
||||
from paperless_ai import indexing
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path):
|
||||
original_dir = indexing.settings.LLM_INDEX_DIR
|
||||
indexing.settings.LLM_INDEX_DIR = tmp_path
|
||||
yield tmp_path
|
||||
indexing.settings.LLM_INDEX_DIR = original_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def real_document(db):
|
||||
return Document.objects.create(
|
||||
|
||||
@@ -3,7 +3,6 @@ from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from django.conf import settings
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.models import LLMEmbeddingBackend
|
||||
@@ -19,14 +18,6 @@ def mock_ai_config():
|
||||
yield MockAIConfig
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path):
|
||||
original_dir = settings.LLM_INDEX_DIR
|
||||
settings.LLM_INDEX_DIR = tmp_path
|
||||
yield tmp_path
|
||||
settings.LLM_INDEX_DIR = original_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_document():
|
||||
doc = MagicMock(spec=Document)
|
||||
|
||||
@@ -1,481 +0,0 @@
|
||||
import re
|
||||
from html import escape
|
||||
from pathlib import Path
|
||||
|
||||
from bleach import clean
|
||||
from bleach import linkify
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import is_naive
|
||||
from django.utils.timezone import make_aware
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.constants import A4
|
||||
from gotenberg_client.options import Measurement
|
||||
from gotenberg_client.options import MeasurementUnitType
|
||||
from gotenberg_client.options import PageMarginsType
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from humanize import naturalsize
|
||||
from imap_tools import MailAttachment
|
||||
from imap_tools import MailMessage
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
|
||||
class MailDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser uses imap_tools to parse .eml files, generates pdf using
|
||||
Gotenberg and sends the html part to a Tika server for text extraction.
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.mail"
|
||||
|
||||
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
|
||||
"""
|
||||
Converts our requested PDF/A output into the Gotenberg API
|
||||
format
|
||||
"""
|
||||
if settings.OCR_OUTPUT_TYPE in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
|
||||
self.log.warning(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
|
||||
return PdfAFormat.A3b
|
||||
return None
|
||||
|
||||
def get_thumbnail(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
file_name=None,
|
||||
) -> Path:
|
||||
if not self.archive_path:
|
||||
self.archive_path = self.generate_pdf(
|
||||
self.parse_file_to_message(document_path),
|
||||
)
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path,
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def extract_metadata(self, document_path: Path, mime_type: str):
|
||||
result = []
|
||||
|
||||
try:
|
||||
mail = self.parse_file_to_message(document_path)
|
||||
except ParseError as e:
|
||||
self.log.warning(
|
||||
f"Error while fetching document metadata for {document_path}: {e}",
|
||||
)
|
||||
return result
|
||||
|
||||
for key, value in mail.headers.items():
|
||||
value = ", ".join(i for i in value)
|
||||
try:
|
||||
value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping header {key}: {e}")
|
||||
continue
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "header",
|
||||
"key": key,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": "attachments",
|
||||
"value": ", ".join(
|
||||
f"{attachment.filename}"
|
||||
f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
|
||||
for attachment in mail.attachments
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": "date",
|
||||
"value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
|
||||
},
|
||||
)
|
||||
|
||||
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
||||
return result
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
file_name=None,
|
||||
mailrule_id: int | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Parses the given .eml into formatted text, based on the decoded email.
|
||||
|
||||
"""
|
||||
|
||||
def strip_text(text: str):
|
||||
"""
|
||||
Reduces the spacing of the given text string
|
||||
"""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = re.sub(r"(\n *)+", "\n", text)
|
||||
return text.strip()
|
||||
|
||||
def build_formatted_text(mail_message: MailMessage) -> str:
|
||||
"""
|
||||
Constructs a formatted string, based on the given email. Basically tries
|
||||
to get most of the email content, included front matter, into a nice string
|
||||
"""
|
||||
fmt_text = f"Subject: {mail_message.subject}\n\n"
|
||||
fmt_text += f"From: {mail_message.from_values.full}\n\n"
|
||||
to_list = [address.full for address in mail_message.to_values]
|
||||
fmt_text += f"To: {', '.join(to_list)}\n\n"
|
||||
if mail_message.cc_values:
|
||||
fmt_text += (
|
||||
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
|
||||
)
|
||||
if mail_message.bcc_values:
|
||||
fmt_text += (
|
||||
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
|
||||
)
|
||||
if mail_message.attachments:
|
||||
att = []
|
||||
for a in mail.attachments:
|
||||
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
|
||||
att.append(
|
||||
f"{a.filename} ({attachment_size})",
|
||||
)
|
||||
fmt_text += f"Attachments: {', '.join(att)}\n\n"
|
||||
|
||||
if mail.html:
|
||||
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
|
||||
|
||||
fmt_text += f"\n\n{strip_text(mail.text)}"
|
||||
|
||||
return fmt_text
|
||||
|
||||
self.log.debug(f"Parsing file {document_path.name} into an email")
|
||||
mail = self.parse_file_to_message(document_path)
|
||||
|
||||
self.log.debug("Building formatted text from email")
|
||||
self.text = build_formatted_text(mail)
|
||||
|
||||
if is_naive(mail.date):
|
||||
self.date = make_aware(mail.date)
|
||||
else:
|
||||
self.date = mail.date
|
||||
|
||||
self.log.debug("Creating a PDF from the email")
|
||||
if mailrule_id:
|
||||
rule = MailRule.objects.get(pk=mailrule_id)
|
||||
self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
|
||||
else:
|
||||
self.archive_path = self.generate_pdf(mail)
|
||||
|
||||
@staticmethod
|
||||
def parse_file_to_message(filepath: Path) -> MailMessage:
|
||||
"""
|
||||
Parses the given .eml file into a MailMessage object
|
||||
"""
|
||||
try:
|
||||
with filepath.open("rb") as eml:
|
||||
parsed = MailMessage.from_bytes(eml.read())
|
||||
if parsed.from_values is None:
|
||||
raise ParseError(
|
||||
f"Could not parse {filepath}: Missing 'from'",
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {filepath}: {err}",
|
||||
) from err
|
||||
|
||||
return parsed
|
||||
|
||||
def tika_parse(self, html: str):
|
||||
self.log.info("Sending content to Tika server")
|
||||
|
||||
try:
|
||||
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||
|
||||
if parsed.content is not None:
|
||||
return parsed.content.strip()
|
||||
return ""
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse content with tika server at "
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
def generate_pdf(
|
||||
self,
|
||||
mail_message: MailMessage,
|
||||
pdf_layout: MailRule.PdfLayout | None = None,
|
||||
) -> Path:
|
||||
archive_path = Path(self.tempdir) / "merged.pdf"
|
||||
|
||||
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
|
||||
|
||||
pdf_layout = (
|
||||
pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
|
||||
) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
|
||||
|
||||
# If no HTML content, create the PDF from the message
|
||||
# Otherwise, create 2 PDFs and merge them with Gotenberg
|
||||
if not mail_message.html:
|
||||
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
||||
else:
|
||||
pdf_of_html_content = self.generate_pdf_from_html(
|
||||
mail_message.html,
|
||||
mail_message.attachments,
|
||||
)
|
||||
|
||||
self.log.debug("Merging email text and HTML content into single PDF")
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.merge.merge() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
match pdf_layout:
|
||||
case MailRule.PdfLayout.HTML_TEXT:
|
||||
route.merge([pdf_of_html_content, mail_pdf_file])
|
||||
case MailRule.PdfLayout.HTML_ONLY:
|
||||
route.merge([pdf_of_html_content])
|
||||
case MailRule.PdfLayout.TEXT_ONLY:
|
||||
route.merge([mail_pdf_file])
|
||||
case MailRule.PdfLayout.TEXT_HTML | _:
|
||||
route.merge([mail_pdf_file, pdf_of_html_content])
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
archive_path.write_bytes(response.content)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while merging email HTML into PDF: {err}",
|
||||
) from err
|
||||
|
||||
return archive_path
|
||||
|
||||
def mail_to_html(self, mail: MailMessage) -> Path:
|
||||
"""
|
||||
Converts the given email into an HTML file, formatted
|
||||
based on the given template
|
||||
"""
|
||||
|
||||
def clean_html(text: str) -> str:
|
||||
"""
|
||||
Attempts to clean, escape and linkify the given HTML string
|
||||
"""
|
||||
if isinstance(text, list):
|
||||
text = "\n".join([str(e) for e in text])
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
text = escape(text)
|
||||
text = clean(text)
|
||||
text = linkify(text, parse_email=True)
|
||||
text = text.replace("\n", "<br>")
|
||||
return text
|
||||
|
||||
data = {}
|
||||
|
||||
data["subject"] = clean_html(mail.subject)
|
||||
if data["subject"]:
|
||||
data["subject_label"] = "Subject"
|
||||
data["from"] = clean_html(mail.from_values.full)
|
||||
if data["from"]:
|
||||
data["from_label"] = "From"
|
||||
data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
|
||||
if data["to"]:
|
||||
data["to_label"] = "To"
|
||||
data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
|
||||
if data["cc"]:
|
||||
data["cc_label"] = "CC"
|
||||
data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
|
||||
if data["bcc"]:
|
||||
data["bcc_label"] = "BCC"
|
||||
|
||||
att = []
|
||||
for a in mail.attachments:
|
||||
att.append(
|
||||
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
|
||||
)
|
||||
data["attachments"] = clean_html(", ".join(att))
|
||||
if data["attachments"]:
|
||||
data["attachments_label"] = "Attachments"
|
||||
|
||||
data["date"] = clean_html(
|
||||
timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
|
||||
)
|
||||
data["content"] = clean_html(mail.text.strip())
|
||||
|
||||
from django.template.loader import render_to_string
|
||||
|
||||
html_file = Path(self.tempdir) / "email_as_html.html"
|
||||
html_file.write_text(render_to_string("email_msg_template.html", context=data))
|
||||
|
||||
return html_file
|
||||
|
||||
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
|
||||
"""
|
||||
Creates a PDF based on the given email, using the email's values in a
|
||||
an HTML template
|
||||
"""
|
||||
self.log.info("Converting mail to PDF")
|
||||
|
||||
css_file = Path(__file__).parent / "templates" / "output.css"
|
||||
email_html_file = self.mail_to_html(mail)
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.chromium.html_to_pdf() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
try:
|
||||
response = (
|
||||
route.index(email_html_file)
|
||||
.resource(css_file)
|
||||
.margins(
|
||||
PageMarginsType(
|
||||
top=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
bottom=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
left=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
right=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
),
|
||||
)
|
||||
.size(A4)
|
||||
.scale(1.0)
|
||||
.run()
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting email to PDF: {err}",
|
||||
) from err
|
||||
|
||||
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
|
||||
email_as_pdf_file.write_bytes(response.content)
|
||||
|
||||
return email_as_pdf_file
|
||||
|
||||
def generate_pdf_from_html(
|
||||
self,
|
||||
orig_html: str,
|
||||
attachments: list[MailAttachment],
|
||||
) -> Path:
|
||||
"""
|
||||
Generates a PDF file based on the HTML and attachments of the email
|
||||
"""
|
||||
|
||||
def clean_html_script(text: str):
|
||||
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
||||
text = compiled_open.sub("<div hidden ", text)
|
||||
|
||||
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
|
||||
text = compiled_close.sub("</div", text)
|
||||
return text
|
||||
|
||||
self.log.info("Converting message html to PDF")
|
||||
|
||||
tempdir = Path(self.tempdir)
|
||||
|
||||
html_clean = clean_html_script(orig_html)
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.chromium.html_to_pdf() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
# Add attachments as resources, cleaning the filename and replacing
|
||||
# it in the index file for inclusion
|
||||
for attachment in attachments:
|
||||
# Clean the attachment name to be valid
|
||||
name_cid = f"cid:{attachment.content_id}"
|
||||
name_clean = "".join(e for e in name_cid if e.isalnum())
|
||||
|
||||
# Write attachment payload to a temp file
|
||||
temp_file = tempdir / name_clean
|
||||
temp_file.write_bytes(attachment.payload)
|
||||
|
||||
route.resource(temp_file)
|
||||
|
||||
# Replace as needed the name with the clean name
|
||||
html_clean = html_clean.replace(name_cid, name_clean)
|
||||
|
||||
# Now store the cleaned up HTML version
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
# This is our index file, the main page basically
|
||||
route.index(html_clean_file)
|
||||
|
||||
# Set page size, margins
|
||||
route.margins(
|
||||
PageMarginsType(
|
||||
top=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
bottom=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
left=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
right=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
),
|
||||
).size(A4).scale(1.0)
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
|
||||
html_pdf = tempdir / "html.pdf"
|
||||
html_pdf.write_bytes(response.content)
|
||||
return html_pdf
|
||||
|
||||
def get_settings(self) -> None:
|
||||
"""
|
||||
This parser does not implement additional settings yet
|
||||
"""
|
||||
return None
|
||||
@@ -1,7 +1,12 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
return MailDocumentParser(*args, **kwargs)
|
||||
# MailDocumentParser accepts no constructor args in the new-style protocol.
|
||||
# Pop legacy args that arrive from the signal-based consumer path.
|
||||
# Phase 4 will replace this signal path with the ParserRegistry.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return MailDocumentParser()
|
||||
|
||||
|
||||
def mail_consumer_declaration(sender, **kwargs):
|
||||
|
||||
@@ -1,71 +1,9 @@
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless_mail.mail import MailAccountHandler
|
||||
from paperless_mail.models import MailAccount
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_dir() -> Path:
|
||||
return (Path(__file__).parent / Path("samples")).resolve()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def broken_email_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "broken.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "simple_text.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "simple_text.eml.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "simple_text.eml.pdf.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "html.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_pdf_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "html.eml.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_thumbnail_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "html.eml.pdf.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_html_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "html.eml.html"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def merged_pdf_first(sample_dir: Path) -> Path:
|
||||
return sample_dir / "first.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def merged_pdf_second(sample_dir: Path) -> Path:
|
||||
return sample_dir / "second.pdf"
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mail_parser() -> MailDocumentParser:
|
||||
return MailDocumentParser(logging_group=None)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
@@ -89,11 +27,3 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
|
||||
@pytest.fixture()
|
||||
def mail_account_handler() -> MailAccountHandler:
|
||||
return MailAccountHandler()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def nginx_base_url() -> Generator[str, None, None]:
|
||||
"""
|
||||
The base URL for the nginx HTTP server we expect to be alive
|
||||
"""
|
||||
yield "http://localhost:8080"
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from datetime import timedelta
|
||||
from unittest import mock
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
from django.test import TestCase
|
||||
@@ -16,6 +15,13 @@ from paperless_mail.models import MailAccount
|
||||
from paperless_mail.oauth import PaperlessMailOAuth2Manager
|
||||
|
||||
|
||||
@override_settings(
|
||||
OAUTH_CALLBACK_BASE_URL="http://localhost:8000",
|
||||
GMAIL_OAUTH_CLIENT_ID="test_gmail_client_id",
|
||||
GMAIL_OAUTH_CLIENT_SECRET="test_gmail_client_secret",
|
||||
OUTLOOK_OAUTH_CLIENT_ID="test_outlook_client_id",
|
||||
OUTLOOK_OAUTH_CLIENT_SECRET="test_outlook_client_secret",
|
||||
)
|
||||
class TestMailOAuth(
|
||||
TestCase,
|
||||
):
|
||||
@@ -31,12 +37,6 @@ class TestMailOAuth(
|
||||
self.user.save()
|
||||
self.client.force_login(self.user)
|
||||
self.mail_account_handler = MailAccountHandler()
|
||||
# Mock settings
|
||||
settings.OAUTH_CALLBACK_BASE_URL = "http://localhost:8000"
|
||||
settings.GMAIL_OAUTH_CLIENT_ID = "test_gmail_client_id"
|
||||
settings.GMAIL_OAUTH_CLIENT_SECRET = "test_gmail_client_secret"
|
||||
settings.OUTLOOK_OAUTH_CLIENT_ID = "test_outlook_client_id"
|
||||
settings.OUTLOOK_OAUTH_CLIENT_SECRET = "test_outlook_client_secret"
|
||||
super().setUp()
|
||||
|
||||
def test_generate_paths(self) -> None:
|
||||
|
||||
@@ -1,118 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class RemoteEngineConfig:
|
||||
def __init__(
|
||||
self,
|
||||
engine: str,
|
||||
api_key: str | None = None,
|
||||
endpoint: str | None = None,
|
||||
):
|
||||
self.engine = engine
|
||||
self.api_key = api_key
|
||||
self.endpoint = endpoint
|
||||
|
||||
def engine_is_valid(self):
|
||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
||||
if self.engine == "azureai":
|
||||
valid = valid and self.endpoint is not None
|
||||
return valid
|
||||
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
"""
|
||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.remote"
|
||||
|
||||
def get_settings(self) -> RemoteEngineConfig:
|
||||
"""
|
||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
||||
"""
|
||||
return RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
|
||||
def supported_mime_types(self):
|
||||
if self.settings.engine_is_valid():
|
||||
return {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def azure_ai_vision_parse(
|
||||
self,
|
||||
file: Path,
|
||||
) -> str | None:
|
||||
"""
|
||||
Uses Azure AI Vision to parse the document and return the text content.
|
||||
It requests a searchable PDF output with embedded text.
|
||||
The PDF is saved to the archive_path attribute.
|
||||
Returns the text content extracted from the document.
|
||||
If the parsing fails, it returns None.
|
||||
"""
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
client = DocumentIntelligenceClient(
|
||||
endpoint=self.settings.endpoint,
|
||||
credential=AzureKeyCredential(self.settings.api_key),
|
||||
)
|
||||
|
||||
try:
|
||||
with file.open("rb") as f:
|
||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||
poller = client.begin_analyze_document(
|
||||
model_id="prebuilt-read",
|
||||
body=analyze_request,
|
||||
output_content_format=DocumentContentFormat.TEXT,
|
||||
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
poller.wait()
|
||||
result_id = poller.details["operation_id"]
|
||||
result = poller.result()
|
||||
|
||||
# Download the PDF with embedded text
|
||||
self.archive_path = self.tempdir / "archive.pdf"
|
||||
with self.archive_path.open("wb") as f:
|
||||
for chunk in client.get_analyze_result_pdf(
|
||||
model_id="prebuilt-read",
|
||||
result_id=result_id,
|
||||
):
|
||||
f.write(chunk)
|
||||
return result.content
|
||||
except Exception as e:
|
||||
self.log.error(f"Azure AI Vision parsing failed: {e}")
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
return None
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||
if not self.settings.engine_is_valid():
|
||||
self.log.warning(
|
||||
"No valid remote parser engine is configured, content will be empty.",
|
||||
)
|
||||
self.text = ""
|
||||
elif self.settings.engine == "azureai":
|
||||
self.text = self.azure_ai_vision_parse(document_path)
|
||||
@@ -1,16 +1,36 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
# The new RemoteDocumentParser does not accept the progress_callback
|
||||
# kwarg injected by the old signal-based consumer. logging_group is
|
||||
# forwarded as a positional arg.
|
||||
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||
kwargs.pop("progress_callback", None)
|
||||
return RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types():
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
def get_supported_mime_types() -> dict[str, str]:
|
||||
from django.conf import settings
|
||||
|
||||
return RemoteDocumentParser(None).supported_mime_types()
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.remote import RemoteEngineConfig
|
||||
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
if not config.engine_is_valid():
|
||||
return {}
|
||||
return RemoteDocumentParser.supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender, **kwargs):
|
||||
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 5,
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
from paperless_remote.signals import get_parser
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
||||
|
||||
def assertContainsStrings(self, content: str, strings: list[str]) -> None:
|
||||
# Asserts that all strings appear in content, in the given order.
|
||||
indices = []
|
||||
for s in strings:
|
||||
if s in content:
|
||||
indices.append(content.index(s))
|
||||
else:
|
||||
self.fail(f"'{s}' is not in '{content}'")
|
||||
self.assertListEqual(indices, sorted(indices))
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.run_subprocess")
|
||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
||||
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
|
||||
# Arrange mock Azure client
|
||||
mock_client = mock.Mock()
|
||||
mock_client_cls.return_value = mock_client
|
||||
|
||||
# Simulate poller result and its `.details`
|
||||
mock_poller = mock.Mock()
|
||||
mock_poller.wait.return_value = None
|
||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
||||
mock_client.begin_analyze_document.return_value = mock_poller
|
||||
mock_poller.result.return_value.content = "This is a test document."
|
||||
|
||||
# Return dummy PDF bytes
|
||||
mock_client.get_analyze_result_pdf.return_value = [
|
||||
b"%PDF-",
|
||||
b"1.7 ",
|
||||
b"FAKEPDF",
|
||||
]
|
||||
|
||||
# Simulate pdftotext by writing dummy text to sidecar file
|
||||
def fake_run(cmd, *args, **kwargs) -> None:
|
||||
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
|
||||
f.write("This is a test document.")
|
||||
|
||||
mock_subprocess.side_effect = fake_run
|
||||
|
||||
with override_settings(
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="somekey",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
||||
):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
parser.parse(
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.text.strip(),
|
||||
["This is a test document."],
|
||||
)
|
||||
|
||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
||||
def test_get_text_with_azure_error_logged_and_returns_none(
|
||||
self,
|
||||
mock_client_cls,
|
||||
) -> None:
|
||||
mock_client = mock.Mock()
|
||||
mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
|
||||
mock_client_cls.return_value = mock_client
|
||||
|
||||
with override_settings(
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="somekey",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
||||
):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
with mock.patch.object(parser.log, "error") as mock_log_error:
|
||||
parser.parse(
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertIsNone(parser.text)
|
||||
mock_client.begin_analyze_document.assert_called_once()
|
||||
mock_client.close.assert_called_once()
|
||||
mock_log_error.assert_called_once()
|
||||
self.assertIn(
|
||||
"Azure AI Vision parsing failed",
|
||||
mock_log_error.call_args[0][0],
|
||||
)
|
||||
|
||||
@override_settings(
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="key",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
||||
)
|
||||
def test_supported_mime_types_valid_config(self) -> None:
|
||||
parser = RemoteDocumentParser(uuid.uuid4())
|
||||
expected_types = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
self.assertEqual(parser.supported_mime_types(), expected_types)
|
||||
|
||||
def test_supported_mime_types_invalid_config(self) -> None:
|
||||
parser = get_parser(uuid.uuid4())
|
||||
self.assertEqual(parser.supported_mime_types(), {})
|
||||
|
||||
@override_settings(
|
||||
REMOTE_OCR_ENGINE=None,
|
||||
REMOTE_OCR_API_KEY=None,
|
||||
REMOTE_OCR_ENDPOINT=None,
|
||||
)
|
||||
def test_parse_with_invalid_config(self) -> None:
|
||||
parser = get_parser(uuid.uuid4())
|
||||
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
|
||||
self.assertEqual(parser.text, "")
|
||||
@@ -1,10 +1,23 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
# RasterisedDocumentParser accepts logging_group for constructor compatibility but
|
||||
# does not store or use it (no legacy DocumentParser base class).
|
||||
# progress_callback is also not used. Both may arrive as a positional arg
|
||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return RasterisedDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def tesseract_consumer_declaration(sender, **kwargs):
|
||||
def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 0,
|
||||
|
||||
@@ -1,924 +0,0 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import unicodedata
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import run_convert
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_tesseract.parsers import post_process_text
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
||||
|
||||
def assertContainsStrings(self, content, strings) -> None:
|
||||
# Asserts that all strings appear in content, in the given order.
|
||||
indices = []
|
||||
for s in strings:
|
||||
if s in content:
|
||||
indices.append(content.index(s))
|
||||
else:
|
||||
self.fail(f"'{s}' is not in '{content}'")
|
||||
self.assertListEqual(indices, sorted(indices))
|
||||
|
||||
def test_post_process_text(self) -> None:
|
||||
text_cases = [
|
||||
("simple string", "simple string"),
|
||||
("simple newline\n testing string", "simple newline\ntesting string"),
|
||||
(
|
||||
"utf-8 строка с пробелами в конце ",
|
||||
"utf-8 строка с пробелами в конце",
|
||||
),
|
||||
]
|
||||
|
||||
for source, result in text_cases:
|
||||
actual_result = post_process_text(source)
|
||||
self.assertEqual(
|
||||
result,
|
||||
actual_result,
|
||||
f"strip_exceess_whitespace({source}) != '{result}', but '{actual_result}'",
|
||||
)
|
||||
|
||||
def test_get_text_from_pdf(self) -> None:
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
text = parser.extract_text(
|
||||
None,
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(text.strip(), ["This is a test document."])
|
||||
|
||||
def test_get_page_count(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- PDF file with a single page
|
||||
- PDF file with multiple pages
|
||||
WHEN:
|
||||
- The number of pages is requested
|
||||
THEN:
|
||||
- The method returns 1 as the expected number of pages
|
||||
- The method returns the correct number of pages (6)
|
||||
"""
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
page_count = parser.get_page_count(
|
||||
str(self.SAMPLE_FILES / "simple-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertEqual(page_count, 1)
|
||||
|
||||
page_count = parser.get_page_count(
|
||||
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertEqual(page_count, 6)
|
||||
|
||||
def test_get_page_count_password_protected(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Password protected PDF file
|
||||
WHEN:
|
||||
- The number of pages is requested
|
||||
THEN:
|
||||
- The method returns None
|
||||
"""
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
|
||||
page_count = parser.get_page_count(
|
||||
str(self.SAMPLE_FILES / "password-protected.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertEqual(page_count, None)
|
||||
self.assertIn("Unable to determine PDF page count", cm.output[0])
|
||||
|
||||
def test_thumbnail(self) -> None:
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
thumb = parser.get_thumbnail(
|
||||
str(self.SAMPLE_FILES / "simple-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(thumb)
|
||||
|
||||
@mock.patch("documents.parsers.run_convert")
|
||||
def test_thumbnail_fallback(self, m) -> None:
|
||||
def call_convert(input_file, output_file, **kwargs) -> None:
|
||||
if ".pdf" in str(input_file):
|
||||
raise ParseError("Does not compute.")
|
||||
else:
|
||||
run_convert(input_file=input_file, output_file=output_file, **kwargs)
|
||||
|
||||
m.side_effect = call_convert
|
||||
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
thumb = parser.get_thumbnail(
|
||||
str(self.SAMPLE_FILES / "simple-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(thumb)
|
||||
|
||||
def test_thumbnail_encrypted(self) -> None:
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
thumb = parser.get_thumbnail(
|
||||
str(self.SAMPLE_FILES / "encrypted.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(thumb)
|
||||
|
||||
def test_get_dpi(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
|
||||
self.assertEqual(dpi, None)
|
||||
|
||||
dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
|
||||
self.assertEqual(dpi, 72)
|
||||
|
||||
def test_simple_digital(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "simple-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertIsFile(parser.archive_path)
|
||||
|
||||
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
|
||||
|
||||
def test_with_form(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "with-form.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertIsFile(parser.archive_path)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.get_text(),
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="redo")
|
||||
def test_with_form_error(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "with-form.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text(),
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_signed(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
|
||||
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text(),
|
||||
[
|
||||
"This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
|
||||
"automated testing of signed/encrypted PDFs",
|
||||
],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_encrypted(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "encrypted.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertEqual(parser.get_text(), "")
|
||||
|
||||
@override_settings(OCR_MODE="redo")
|
||||
def test_with_form_error_notext(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "with-form.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.get_text(),
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="force")
|
||||
def test_with_form_force(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "with-form.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.get_text(),
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
def test_image_simple(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
|
||||
|
||||
self.assertIsFile(parser.archive_path)
|
||||
|
||||
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
|
||||
|
||||
def test_image_simple_alpha(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tempdir:
|
||||
# Copy sample file to temp directory, as the parsing changes the file
|
||||
# and this makes it modified to Git
|
||||
sample_file = self.SAMPLE_FILES / "simple-alpha.png"
|
||||
dest_file = Path(tempdir) / "simple-alpha.png"
|
||||
shutil.copy(sample_file, dest_file)
|
||||
|
||||
parser.parse(str(dest_file), "image/png")
|
||||
|
||||
self.assertIsFile(parser.archive_path)
|
||||
|
||||
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
|
||||
|
||||
def test_image_calc_a4_dpi(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
dpi = parser.calculate_a4_dpi(
|
||||
str(self.SAMPLE_FILES / "simple-no-dpi.png"),
|
||||
)
|
||||
|
||||
self.assertEqual(dpi, 62)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
||||
def test_image_dpi_fail(self, m) -> None:
|
||||
m.return_value = None
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def f() -> None:
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "simple-no-dpi.png"),
|
||||
"image/png",
|
||||
)
|
||||
|
||||
self.assertRaises(ParseError, f)
|
||||
|
||||
@override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
|
||||
def test_image_no_dpi_default(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
|
||||
|
||||
self.assertIsFile(parser.archive_path)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["this is a test document."],
|
||||
)
|
||||
|
||||
def test_multi_page(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_PAGES=2, OCR_MODE="skip")
|
||||
def test_multi_page_pages_skip(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
|
||||
def test_multi_page_pages_redo(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_PAGES=2, OCR_MODE="force")
|
||||
def test_multi_page_pages_force(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_multi_page_analog_pages_skip(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
|
||||
def test_multi_page_analog_pages_redo(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR of only pages 1 and 2 requested
|
||||
- OCR mode set to redo
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text of page 1 and 2 extracted
|
||||
- An archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
|
||||
self.assertNotIn("page 3", parser.get_text().lower())
|
||||
|
||||
@override_settings(OCR_PAGES=1, OCR_MODE="force")
|
||||
def test_multi_page_analog_pages_force(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR of only page 1 requested
|
||||
- OCR mode set to force
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Only text of page 1 is extracted
|
||||
- An archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
|
||||
self.assertNotIn("page 2", parser.get_text().lower())
|
||||
self.assertNotIn("page 3", parser.get_text().lower())
|
||||
|
||||
@override_settings(OCR_MODE="skip_noarchive")
|
||||
def test_skip_noarchive_withtext(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR mode set to skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip_noarchive")
|
||||
def test_skip_noarchive_notext(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR mode set to skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- An archive file is created with the OCRd text
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
||||
def test_skip_archive_never_withtext(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to never
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from text layer is extracted
|
||||
- Archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
||||
def test_skip_archive_never_withimages(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to never
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- Archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
||||
def test_skip_archive_withtext_withtext(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to with_text
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from text layer is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
||||
def test_skip_archive_withtext_withimages(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to with_text
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- Archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
|
||||
def test_skip_archive_always_withtext(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from text layer is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
|
||||
def test_skip_archive_always_withimages(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_multi_page_mixed(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with some text contained in images and some in text layer
|
||||
- OCR mode set to skip
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- An archive file is created with the OCRd text and the original text
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
|
||||
)
|
||||
|
||||
with (parser.tempdir / "sidecar.txt").open() as f:
|
||||
sidecar = f.read()
|
||||
|
||||
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
|
||||
|
||||
@override_settings(OCR_MODE="redo")
|
||||
def test_single_page_mixed(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with some text contained in images and some in text layer
|
||||
- Text and images are mixed on the same page
|
||||
- OCR mode set to redo
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- Full content of the file is parsed (not just the image text)
|
||||
- An archive file is created with the OCRd text and the original text
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
[
|
||||
"this is some normal text, present on page 1 of the document.",
|
||||
"this is some text, but in an image, also on page 1.",
|
||||
"this is further text on page 1.",
|
||||
],
|
||||
)
|
||||
|
||||
with (parser.tempdir / "sidecar.txt").open() as f:
|
||||
sidecar = f.read().lower()
|
||||
|
||||
self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
|
||||
self.assertNotIn(
|
||||
"this is some normal text, present on page 1 of the document.",
|
||||
sidecar,
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip_noarchive")
|
||||
def test_multi_page_mixed_no_archive(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with some text contained in images and some in text layer
|
||||
- OCR mode set to skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created as original file contains text
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 4", "page 5", "page 6"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
|
||||
def test_rotate(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
|
||||
self.assertContainsStrings(
|
||||
parser.get_text(),
|
||||
[
|
||||
"This is the text that appears on the first page. It’s a lot of text.",
|
||||
"Even if the pages are rotated, OCRmyPDF still gets the job done.",
|
||||
"This is a really weird file with lots of nonsense text.",
|
||||
"If you read this, it’s your own fault. Also check your screen orientation.",
|
||||
],
|
||||
)
|
||||
|
||||
def test_multi_page_tiff(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page TIFF image
|
||||
WHEN:
|
||||
- Image is parsed
|
||||
THEN:
|
||||
- Text from all pages extracted
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "multi-page-images.tiff"),
|
||||
"image/tiff",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_multi_page_tiff_alpha(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page TIFF image
|
||||
- Image include an alpha channel
|
||||
WHEN:
|
||||
- Image is parsed
|
||||
THEN:
|
||||
- Text from all pages extracted
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
|
||||
with tempfile.NamedTemporaryFile() as tmp_file:
|
||||
shutil.copy(sample_file, tmp_file.name)
|
||||
parser.parse(
|
||||
tmp_file.name,
|
||||
"image/tiff",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_multi_page_tiff_alpha_srgb(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page TIFF image
|
||||
- Image include an alpha channel
|
||||
- Image is srgb colorspace
|
||||
WHEN:
|
||||
- Image is parsed
|
||||
THEN:
|
||||
- Text from all pages extracted
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
sample_file = str(
|
||||
self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
|
||||
)
|
||||
with tempfile.NamedTemporaryFile() as tmp_file:
|
||||
shutil.copy(sample_file, tmp_file.name)
|
||||
parser.parse(
|
||||
tmp_file.name,
|
||||
"image/tiff",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_ocrmypdf_parameters(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters(
|
||||
input_file="input.pdf",
|
||||
output_file="output.pdf",
|
||||
sidecar_file="sidecar.txt",
|
||||
mime_type="application/pdf",
|
||||
safe_fallback=False,
|
||||
)
|
||||
|
||||
self.assertEqual(params["input_file_or_options"], "input.pdf")
|
||||
self.assertEqual(params["output_file"], "output.pdf")
|
||||
self.assertEqual(params["sidecar"], "sidecar.txt")
|
||||
|
||||
with override_settings(OCR_CLEAN="none"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("clean", params)
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["clean"])
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["clean_final"])
|
||||
self.assertNotIn("clean", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["clean"])
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["deskew"])
|
||||
|
||||
with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("deskew", params)
|
||||
|
||||
with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("deskew", params)
|
||||
|
||||
with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertIn("max_image_mpixels", params)
|
||||
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
|
||||
|
||||
with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("max_image_mpixels", params)
|
||||
|
||||
def test_rtl_language_detection(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text in an RTL language
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from the document is extracted
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "rtl-test.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
# OCR output for RTL text varies across platforms/versions due to
|
||||
# bidi controls and presentation forms; normalize before assertion.
|
||||
normalized_text = "".join(
|
||||
char
|
||||
for char in unicodedata.normalize("NFKC", parser.get_text())
|
||||
if unicodedata.category(char) != "Cf" and not char.isspace()
|
||||
)
|
||||
|
||||
self.assertIn("ةرازو", normalized_text)
|
||||
self.assertTrue(
|
||||
any(token in normalized_text for token in ("ةیلخادلا", "الاخليد")),
|
||||
)
|
||||
|
||||
@mock.patch("ocrmypdf.ocr")
|
||||
def test_gs_rendering_error(self, m) -> None:
|
||||
m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
self.assertRaises(
|
||||
ParseError,
|
||||
parser.parse,
|
||||
str(self.SAMPLE_FILES / "simple-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
|
||||
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = Path(__file__).parent / "samples"
|
||||
|
||||
def test_bmp(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
def test_jpg(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
def test_heic(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertIn("pizza", parser.get_text().lower())
|
||||
|
||||
@override_settings(OCR_IMAGE_DPI=200)
|
||||
def test_gif(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
def test_tiff(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
|
||||
self.assertIsFile(parser.archive_path)
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
@override_settings(OCR_IMAGE_DPI=72)
|
||||
def test_webp(self) -> None:
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
str(self.SAMPLE_FILES / "document.webp"),
|
||||
"image/webp",
|
||||
)
|
||||
self.assertIsFile(parser.archive_path)
|
||||
# Older tesseracts consistently mangle the space between "a webp",
|
||||
# tesseract 5.3.0 seems to do a better job, so we're accepting both
|
||||
self.assertRegex(
|
||||
parser.get_text().lower(),
|
||||
r"this is a ?webp document, created 11/14/2022.",
|
||||
)
|
||||
@@ -1,4 +1,9 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
# TextDocumentParser accepts logging_group for constructor compatibility but
|
||||
@@ -9,10 +14,10 @@ def get_parser(*args, **kwargs):
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return TextDocumentParser()
|
||||
return TextDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def text_consumer_declaration(sender, **kwargs):
|
||||
def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 10,
|
||||
|
||||
28
uv.lock
generated
@@ -4754,11 +4754,11 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "tinytag"
|
||||
version = "2.2.0"
|
||||
version = "2.2.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/98/07/fb260bac73119f369a10e884016516d07cd760b5068e703773f83dd5e7bf/tinytag-2.2.0.tar.gz", hash = "sha256:f15b082510f6e0fc717e597edc8759d6f2d3ff6194ac0f3bcd675a9a09d9b798", size = 38120, upload-time = "2025-12-15T21:10:19.093Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/96/59/8a8cb2331e2602b53e4dc06960f57d1387a2b18e7efd24e5f9cb60ea4925/tinytag-2.2.1.tar.gz", hash = "sha256:e6d06610ebe7cd66fd07be2d3b9495914ab32654a5e47657bb8cd44c2484523c", size = 38214, upload-time = "2026-03-15T18:48:01.11Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/e2/9818fcebb348237389d2ac2fea97cf2b2638378a0866105a45ae9be49728/tinytag-2.2.0-py3-none-any.whl", hash = "sha256:d2cf3ef8ee0f6c854663f77d9d5f8159ee1c834c70f5ea4f214ddc4af8148f79", size = 32861, upload-time = "2025-12-15T21:10:17.63Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/34/d50e338631baaf65ec5396e70085e5de0b52b24b28db1ffbc1c6e82190dc/tinytag-2.2.1-py3-none-any.whl", hash = "sha256:ed8b1e6d25367937e3321e054f4974f9abfde1a3e0a538824c87da377130c2b6", size = 32927, upload-time = "2026-03-15T18:47:59.613Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5643,7 +5643,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "zensical"
|
||||
version = "0.0.25"
|
||||
version = "0.0.26"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -5653,16 +5653,18 @@ dependencies = [
|
||||
{ name = "pymdown-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/18/69/4b49ce778059b4888ea854cf4db40e1b2080fe828b7280198999048d6fce/zensical-0.0.25.tar.gz", hash = "sha256:462808359d949469fa7209d367f2e38ed796744074e5dadeac9ddfef0c44be25", size = 3841318, upload-time = "2026-03-10T19:32:35.048Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d5/1f/0a0b1ce8e0553a9dabaedc736d0f34b11fc33d71ff46bce44d674996d41f/zensical-0.0.26.tar.gz", hash = "sha256:f4d9c8403df25fbb3d6dd9577122dc2f23c73a2d16ab778bb7d40370dd71e987", size = 3841473, upload-time = "2026-03-11T09:51:38.838Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/42/7c/f6f5eb1903b5a557d98f48d09e3d4bc33033ed78508986250dabe5387d91/zensical-0.0.25-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c481dd16a968f97d43f6b596e10e941d8294ed446b8b117235a6b149c0d6965", size = 12263809, upload-time = "2026-03-10T19:31:49.907Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/b2/3f8be43526a68c52c84f099887d1903c2526a22aa4344378a72671bf6070/zensical-0.0.25-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ae51751e8b11f50df04641b40c1e07d4b703fed9d9548b16dbcb0cf260da229a", size = 12146107, upload-time = "2026-03-10T19:31:53.576Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/16/59/89a3a715b1fe538b4b5ee382d71b86bd06d4f351383e36eefd36e824c150/zensical-0.0.25-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56ccf88245bd0b3684bf313384164972f1890802d4a51dd9b7ae6ea126a810bc", size = 12505963, upload-time = "2026-03-10T19:31:57.517Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ad/5b/cc0bada291818bdf36be777af9c16f655a021f16578a31e6fb233affca03/zensical-0.0.25-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2f4e58bcc06f3e50cc518666a0c9d8f82246255a42b37bb1d7c7343e214fbac", size = 12455496, upload-time = "2026-03-10T19:32:02.37Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/16/ff91ee42d8b14a1b63e2e0d74922e6c4b0ec1da3819377f20b7ca2742f76/zensical-0.0.25-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:69895273b1319a45667abac543c3e5065ff2a646d9a698eae056b6a35b57e00a", size = 12683609, upload-time = "2026-03-10T19:32:06.144Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/01/fd/a85acc4234d31658f4bb54c4900edfc8d4227ad83e4c79de92cfdcd05c79/zensical-0.0.25-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c51a00ae1de2e9647bfd0ea1965b223fb3891111a00930416e1277e06f3ab3c4", size = 12725420, upload-time = "2026-03-10T19:32:09.938Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/c7/896c91e457af3d5769d8d70d2bd66a8a287ad129879b51ab5e985ac68889/zensical-0.0.25-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:28e56ec1f06ea66227c1f5af9d7a6ed3bd4246e6af1e45d29e09f40251b52e1f", size = 12861970, upload-time = "2026-03-10T19:32:13.471Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/06/5d804cf19e4e093394674d9f213546dc1364a34fd85d81a1153b05733c5a/zensical-0.0.25-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2d5997baad148b65eb0de6baf81973110538e01a3f64467d06d0c5ac23b0d70", size = 12816321, upload-time = "2026-03-10T19:32:17.031Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/58/fa3d9538ff1ea8cf4a193edbf47254f374fa7983fcfa876bb4336d72c53a/zensical-0.0.26-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7823b25afe7d36099253aa59d643abaac940f80fd015d4a37954210c87d3da56", size = 12263607, upload-time = "2026-03-11T09:50:49.202Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5f/6e/44a3b21bd3569b9cad203364d73a956768d28a879e4c2be91bd889f74d2c/zensical-0.0.26-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:c0254814382cdd3769bc7689180d09bf41de8879871dd736dc52d5f141e8ada7", size = 12144562, upload-time = "2026-03-11T09:50:53.685Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/ae/31b9885745b3e7ef23a3ae7f175b879807288d11b3fb7e2d3c119c916258/zensical-0.0.26-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c8e601b2bbd239e564b04cf235eefb9777e7dfc7e1857b8871d6cdcfb577aa0", size = 12506728, upload-time = "2026-03-11T09:50:57.775Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bd/93/f5291e2c47076474f181f6eef35ef0428117d3f192da4358c0511e2ce09e/zensical-0.0.26-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2dc43c7e6c25d9724fc0450f0273ca4e5e2506eeb7f89f52f1405a592896ca3b", size = 12454975, upload-time = "2026-03-11T09:51:01.514Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/aa/2e/61cac4f2ebad31dab768eb02753ffde9e56d4d34b8f876b949bf516fbd50/zensical-0.0.26-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24ed236d1254cc474c19227eaa3670a1ccf921af53134ec5542b05853bdcd59c", size = 12791930, upload-time = "2026-03-11T09:51:05.162Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/02/86/51995d1ed2dd6ad8a1a70bcdf3c5eb16b50e62ea70e638d454a6b9061c4d/zensical-0.0.26-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1110147710d1dd025d932c4a7eada836bdf079c91b70fb0ae5b202e14b094617", size = 12548166, upload-time = "2026-03-11T09:51:09.218Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3d/93/decbafdbfc77170cbc3851464632390846e9aaf45e743c8dd5a24d5673e9/zensical-0.0.26-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7d21596a785428cdebc20859bd94a05334abe14ad24f1bb9cd80d19219e3c220", size = 12682103, upload-time = "2026-03-11T09:51:12.68Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fb/e2/391d2d08dde621177da069a796a886b549fefb15734aeeb6e696af99b662/zensical-0.0.26-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:680a3c7bb71499b4da784d6072e44b3d7b8c0df3ce9bbd9974e24bd8058c2736", size = 12724219, upload-time = "2026-03-11T09:51:17.32Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/2a/21b40c5c40a67da8a841f278d61dbd8d5e035e489de6fe1cef5f4e211b4f/zensical-0.0.26-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:e3294a79f98218b6fc2219232e166aa0932ae4dad58f6c8dbc0dbe0ecbff9c25", size = 12862117, upload-time = "2026-03-11T09:51:22.161Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/51/76/e1910d6d75d207654c867b8efbda6822dedda9fed3601bf4a864a1f4fe26/zensical-0.0.26-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:630229587df1fb47be184a4a69d0772ce59a44cd2c481ae9f7e8852fffaff11e", size = 12815714, upload-time = "2026-03-11T09:51:26.24Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||